Sentence Embeddings¶

We'll see how we can represent sentences using vectors in a high-dimensional space, and how we measure and visualize similarity in that space.

Example based on https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/clustering/fast_clustering.py

Install and Import¶

Press the Run button below (next to "3 cells hidden")

In [ ]:
# Install the needed libraries
!pip install -q sentence-transformers
In [ ]:
# Set up TensorBoard to view the embeddings.

import tensorflow as tf
import tensorboard as tb
from torch.utils.tensorboard import SummaryWriter

# Work around a known conflict between TensorFlow and PyTorch's
# SummaryWriter.add_embedding by pointing tf.io.gfile at
# TensorBoard's stub implementation.
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

%load_ext tensorboard
In [ ]:
# Import libraries we'll need
from sentence_transformers import SentenceTransformer, util
import os
import csv
import time

Load Model and Data¶

In this example, we download a large set of questions from Quora and then find similar questions in this set.

Press the Run button below.

In [ ]:
# Load the model for computing sentence embeddings. We use one trained for detecting similar questions.
model = SentenceTransformer('all-MiniLM-L6-v2')
#model = SentenceTransformer('all-mpnet-base-v2')
In [ ]:
# We download the Quora Duplicate Questions Dataset (https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs)
# and find similar questions in it
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 5000 # Limit our corpus to the first 5k unique questions


# Download the dataset if we don't have a local copy yet
if not os.path.exists(dataset_path):
    print("Download dataset")
    util.http_get(url, dataset_path)

# Get all unique sentences from the file
corpus_sentences = set()
with open(dataset_path, encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        corpus_sentences.add(row['question1'])
        corpus_sentences.add(row['question2'])
        if len(corpus_sentences) >= max_corpus_size:
            break
corpus_sentences = list(corpus_sentences)

Compute Sentence Vectors¶

We tell the model to compute the embeddings for each sentence. This will take about a minute.

In [ ]:
corpus_sentences[:3]
In [ ]:
len(corpus_sentences)
In [ ]:
corpus_embeddings = model.encode(corpus_sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True)
In [ ]:
corpus_embeddings[2].shape

Visualize Sentence Vectors¶

Run the two cells below to launch a viewer to show these embeddings.

Switch to UMAP mode (bottom-left pane).

Try rotating the view by dragging. Notice that some points that appeared to be on top of each other were actually in different places; they only looked nearby because we were taking a 2D picture of a 3D space. By analogy, even the 3D view is a picture of a much higher-dimensional space (384 dimensions in this case).

Rotate the view around until you can clearly see a clump of points that isn't overlapped with some other points. It's easiest to see these on the outside edges of the "ball" of data. Mouse around that clump to see what the sentences are. Try to identify a characteristic that those sentences have in common. Also think about what's different among those sentences: what does the embedding projection not capture?

Next, try clicking on an individual sentence. Look on the right pane: this is "getting a tape measure out" and looking at distances (or similarities) in the original space.

In [ ]:
# Write the embeddings to a file so that the projector can view them.
writer = SummaryWriter()
writer.add_embedding(corpus_embeddings, metadata=corpus_sentences)
writer.close()
In [ ]:
%tensorboard --logdir=runs

Find Clusters¶

The approach we'll use here looks for "communities" of sentences. It tries to find groups of highly-similar sentences. It doesn't try to assign every sentence to a community.

There are two parameters that we can configure:

  1. How similar do sentences need to be? If a sentence isn't similar enough to a community, it won't get included.
  2. How big do communities need to be? If a community is too small, it won't get reported.
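
To build intuition for what community_detection is doing, here is a minimal sketch of the idea in plain PyTorch. This is not the library's actual implementation (the real one is faster and sorts its output); the function name and the greedy single pass here are ours, for illustration only.

In [ ]:
# A rough sketch of community detection (illustrative, not the real implementation)
import torch

def sketch_community_detection(embeddings, min_community_size=5, threshold=0.75):
    # Normalize rows so the dot product of two rows equals their cosine similarity
    emb = torch.nn.functional.normalize(embeddings, dim=1)
    cos_scores = emb @ emb.T
    communities = []
    assigned = set()
    for i in range(len(emb)):
        if i in assigned:
            continue
        # Every sentence similar enough to sentence i (including i itself)
        members = [m for m in torch.where(cos_scores[i] >= threshold)[0].tolist()
                   if m not in assigned]
        if len(members) >= min_community_size:
            communities.append(members)
            assigned.update(members)
    return communities

# e.g. sketch_community_detection(corpus_embeddings) returns lists of sentence indices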
In [ ]:
start_time = time.time()

# Two parameters to tune:
# min_community_size: only report communities that contain at least this many sentences
# threshold: treat sentence pairs with a cosine similarity above this value as similar
clusters = util.community_detection(corpus_embeddings, min_community_size=5, threshold=0.75)

print("Clustering done after {:.2f} sec".format(time.time() - start_time))

# For each community, print the first 3 and last 3 sentences
for i, cluster in enumerate(clusters):
    print("\nCommunity {} ({} sentences)".format(i+1, len(cluster)))
    for sentence_id in cluster[0:3]:
        print("\t", corpus_sentences[sentence_id])
    print("\t", "...")
    for sentence_id in cluster[-3:]:
        print("\t", corpus_sentences[sentence_id])

How does it work?¶

Now we have a vector for each sentence (in this case, each question). They are stored in an object called a tensor. Each row of the tensor corresponds to a sentence. The elements in that row are the vector for that sentence.

In [ ]:
corpus_embeddings
In [ ]:
corpus_embeddings.shape

Here is how we can get out the vector for a sentence.

In [ ]:
# Let's look for a few example sentences by keyword
gmail_sents = [(i, sent) for i, sent in enumerate(corpus_sentences) if 'password' in sent.lower() and 'gmail' in sent.lower()]
gmail_sents
In [ ]:
sentence_idx = gmail_sents[0][0]
print("Getting the vector for sentence {}: \"{}\"".format(sentence_idx, corpus_sentences[sentence_idx]))
vec = corpus_embeddings[sentence_idx]

print("The vector has", len(vec), "elements.")

Looking for Similar Vectors¶

Let's compute the similarity of this vector with every other vector. We do this by multiplying corresponding elements of the two vectors and adding up the results. (This is called a dot product.) It turns out we can do this for all sentences at once with a single matrix multiplication: multiplying the matrix of embeddings (one row per sentence) by our vector computes the dot product of every row with that vector.

In [ ]:
similarity_scores = corpus_embeddings.matmul(vec)
similarity_scores
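
As a sanity check, we can confirm that one entry of this result matches the multiply-and-add recipe computed by hand. The index 0 below is an arbitrary choice; any sentence would do.

In [ ]:
# Compute the dot product with one arbitrary sentence "by hand"
other_idx = 0
by_hand = (corpus_embeddings[other_idx] * vec).sum()
print(by_hand.item(), similarity_scores[other_idx].item())  # the two numbers should match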

Which sentences have the most similar vectors? Let's find the sentences corresponding to the top k similarity scores.

In [ ]:
[corpus_sentences[i] for i in similarity_scores.topk(15).indices]
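
To see how strong each match is, we can also print the similarity score next to each sentence. This is just an illustrative variation on the cell above.

In [ ]:
# Show each top match together with its similarity score
top_results = similarity_scores.topk(15)
for score, idx in zip(top_results.values, top_results.indices):
    print("{:.3f}\t{}".format(score.item(), corpus_sentences[idx]))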