An exercise on bias in word embeddings
We'll see how to use word embeddings and how they capture both helpful and biased associations.
In [16]:
import matplotlib.pyplot as plt
import numpy as np
In [3]:
import gensim.downloader
# show all available models
print('\n'.join(gensim.downloader.info()['models'].keys()))
fasttext-wiki-news-subwords-300 conceptnet-numberbatch-17-06-300 word2vec-ruscorpora-300 word2vec-google-news-300 glove-wiki-gigaword-50 glove-wiki-gigaword-100 glove-wiki-gigaword-200 glove-wiki-gigaword-300 glove-twitter-25 glove-twitter-50 glove-twitter-100 glove-twitter-200 __testing_word2vec-matrix-synopsis
In [31]:
#word2vec_model = gensim.downloader.load('glove-twitter-25')
word2vec_model = gensim.downloader.load('glove-wiki-gigaword-50')
[==================================================] 100.0% 66.0/66.0MB downloaded
In [32]:
# show some of the model's vocabulary (most frequent tokens first). With the Wikipedia+Gigaword model these are
# news-style words and punctuation; if you load the Twitter model instead (commented out above), you'll see
# twitter-isms that come from its special tokenizer.
print(', '.join(word2vec_model.index_to_key[:100]))
the, ,, ., of, to, and, in, a, ", 's, for, -, that, on, is, was, said, with, he, as, it, by, at, (, ), from, his, '', ``, an, be, has, are, have, but, were, not, this, who, they, had, i, which, will, their, :, or, its, one, after, new, been, also, we, would, two, more, ', first, about, up, when, year, there, all, --, out, she, other, people, n't, her, percent, than, over, into, last, some, government, time, $, you, years, if, no, world, can, three, do, ;, president, only, state, million, could, us, most, _, against, u.s.
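As a quick sanity check, we can test whether a particular word is in the vocabulary and where it ranks by frequency. This is a sketch using gensim 4.x's key_to_index mapping; 'catss' is just an arbitrary misspelling used as an out-of-vocabulary example.
In [ ]:
# check vocabulary membership and frequency rank (gensim 4.x KeyedVectors API)
for word in ['cat', 'catss']:
    if word in word2vec_model.key_to_index:
        print(f"'{word}' is in the vocabulary at frequency rank {word2vec_model.key_to_index[word]}")
    else:
        print(f"'{word}' is not in the vocabulary")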
What is the dimensionality of the vectors in the word embedding matrix?
In [33]:
word2vec_model.vector_size
Out[33]:
50
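The full embedding matrix has one row per vocabulary word and one column per dimension. A sketch of how to inspect it, assuming the standard .vectors attribute on gensim's KeyedVectors:
In [ ]:
# the whole embedding matrix: (vocabulary size, vector size)
print(word2vec_model.vectors.shape)
print(len(word2vec_model.index_to_key), word2vec_model.vector_size)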
In [34]:
# show the model's vector for a word
print(word2vec_model['cat'])
print(word2vec_model['cat'].shape)
[ 0.45281 -0.50108 -0.53714 -0.015697 0.22191 0.54602 -0.67301 -0.6891 0.63493 -0.19726 0.33685 0.7735 0.90094 0.38488 0.38367 0.2657 -0.08057 0.61089 -1.2894 -0.22313 -0.61578 0.21697 0.35614 0.44499 0.60885 -1.1633 -1.1579 0.36118 0.10466 -0.78325 1.4352 0.18629 -0.26112 0.83275 -0.23123 0.32481 0.14485 -0.44552 0.33497 -0.95946 -0.097479 0.48138 -0.43352 0.69455 0.91043 -0.28173 0.41637 -1.2609 0.71278 0.23782 ]
(50,)
In [45]:
# show the model's vector for several words side by side as heatmaps
def show_heatmap(word, ax, vec=None, cmap='BrBG'):
    if vec is None:
        vec = word2vec_model[word]
    ax.imshow(vec.reshape(5, -1), cmap=cmap)
    ax.set_title(word)

fig, axs = plt.subplots(2, 2, figsize=(10, 10))
# cat:
show_heatmap('cat', axs[0, 0])
# dog:
show_heatmap('dog', axs[0, 1])
# house
show_heatmap('house', axs[1, 0])
# pet
show_heatmap('pet', axs[1, 1])
In [46]:
# show dot products, component by component, for each vector with "cat"
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
# cat: (use a diverging colormap to show positive and negative values)
show_heatmap('cat', axs[0, 0], word2vec_model['cat'] * word2vec_model['cat'], cmap='BrBG')
# dog:
show_heatmap('dog', axs[0, 1], word2vec_model['cat'] * word2vec_model['dog'], cmap='BrBG')
# house
show_heatmap('house', axs[1, 0], word2vec_model['cat'] * word2vec_model['house'], cmap='BrBG')
# pet
show_heatmap('pet', axs[1, 1], word2vec_model['cat'] * word2vec_model['pet'], cmap='BrBG')
# show color legend
fig.colorbar(axs[0, 0].images[0], ax=axs.ravel().tolist(), orientation='horizontal')
Out[46]:
<matplotlib.colorbar.Colorbar at 0x1bade9c30>
In [48]:
# Show the dot products of "cat" with "cat", "dog", "house", and "pet".
# Note that "cat" and "dog" are similar, but "cat" and "house" are not.
for word in ['cat', 'dog', 'house', 'pet']:
    print(f'{word:5s}: {np.dot(word2vec_model["cat"], word2vec_model[word]):.2f}')
cat  : 19.43
dog  : 19.74
house: 6.94
pet  : 16.15
In [49]:
# Now normalize the vectors first
cat_normalized = word2vec_model['cat'] / np.linalg.norm(word2vec_model['cat'])
for word in ['cat', 'dog', 'house', 'pet']:
    normalized_vec = word2vec_model[word] / np.linalg.norm(word2vec_model[word])
    print(f'{word:5s}: {np.dot(cat_normalized, normalized_vec):.2f}')
cat  : 1.00
dog  : 0.92
house: 0.31
pet  : 0.78
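gensim can compute these cosine similarities directly. A sketch cross-checking the manual numbers above with the built-in similarity and most_similar methods (they should agree, since both work on unit-normalized vectors):
In [ ]:
# similarity() is cosine similarity, so these should match the values printed above
for word in ['dog', 'house', 'pet']:
    print(f"{word:5s}: {word2vec_model.similarity('cat', word):.2f}")
# most_similar() ranks the whole vocabulary by cosine similarity to 'cat'
word2vec_model.most_similar('cat', topn=5)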
Directions are meaningful
Certain directions in the embedding space correspond to relationships between words: king − man + woman lands near queen, and the step from an adjective to its comparative or superlative form (fast → faster → fastest) points in a roughly consistent direction.
In [59]:
word2vec_model.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
Out[59]:
[('queen', 0.9288908839225769),
('throne', 0.882325291633606),
('elizabeth', 0.8789501786231995),
('princess', 0.8767548203468323),
('daughter', 0.8705160617828369),
('prince', 0.8702554702758789),
('kingdom', 0.8607221841812134),
('eldest', 0.8595449328422546),
('monarch', 0.8584721684455872),
('widow', 0.8549265265464783)]
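most_similar_cosmul uses the multiplicative 3CosMul scoring; the plain most_similar call combines the vectors additively (roughly king − man + woman) and usually gives a similar top answer. A sketch for comparison:
In [ ]:
# additive analogy: which words are closest to (king - man + woman)?
word2vec_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)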
In [61]:
comparatives = [
word2vec_model['faster'] - word2vec_model['fast'],
word2vec_model['better'] - word2vec_model['good'],
word2vec_model['stronger'] - word2vec_model['strong'],
]
# same thing, except use the normalized vectors
comparatives_normalized = [
word2vec_model.get_vector('faster', norm=True) - word2vec_model.get_vector('fast', norm=True),
word2vec_model.get_vector('better', norm=True) - word2vec_model.get_vector('good', norm=True),
word2vec_model.get_vector('stronger', norm=True) - word2vec_model.get_vector('strong', norm=True),
]
comparative_vec = np.mean(comparatives, axis=0)
comparative_vec.shape
Out[61]:
(50,)
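One way to check that the three differences really point in a common direction is to measure the cosine similarity of each one against their mean; a sketch:
In [ ]:
# how well does each individual comparative difference align with the averaged direction?
for name, diff in zip(['faster-fast', 'better-good', 'stronger-strong'], comparatives):
    cos = np.dot(diff, comparative_vec) / (np.linalg.norm(diff) * np.linalg.norm(comparative_vec))
    print(f'{name:16s}: {cos:.2f}')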
In [66]:
dark_vec = word2vec_model.get_vector('dark', norm=False)
dark_comparative = dark_vec + comparative_vec
word2vec_model.similar_by_vector(dark_comparative, topn=5)
#word2vec_model.most_similar_cosmul(positive=['clear', 'faster'], negative=['fast'], topn=5)
Out[66]:
[('dark', 0.9151450395584106),
('darker', 0.8416261672973633),
('shadows', 0.798949658870697),
('shades', 0.7860827445983887),
('eyes', 0.7606992125511169)]
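We can apply the same averaged comparative direction to other adjectives to see how well it generalizes. A sketch trying 'slow' and 'tall'; with only 50 dimensions the results may be noisy:
In [ ]:
# add the comparative direction to other adjectives and look at the nearest words
for adjective in ['slow', 'tall']:
    vec = word2vec_model[adjective] + comparative_vec
    print(adjective, '->', word2vec_model.similar_by_vector(vec, topn=3))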
In [51]:
apple = word2vec_model['apple']
tree = word2vec_model['tree']
grows_on = apple - tree
grape = word2vec_model['grape']
word2vec_model.most_similar(positive=['apple', 'tree'], negative=['grape'])
Out[51]:
[('windows', 0.7277389764785767),
('sun', 0.6778556108474731),
('mac', 0.6661618947982788),
('door', 0.6645179986953735),
('memory', 0.6577113270759583),
('platform', 0.6541374325752258),
('phone', 0.6472747325897217),
('iphone', 0.6441183686256409),
('computer', 0.6348914504051208),
('pc', 0.6300128698348999)]
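The query above (apple + tree − grape) is dominated by Apple-the-company terms, since that sense of 'apple' is far more common in news text. The grows_on direction defined above seems intended for the reverse question: what is to 'grape' as 'tree' is to 'apple'? A hedged sketch of that query; the answer may or may not come out as something like 'vine':
In [ ]:
# what does a grape grow on?  grape - (apple - tree)
word2vec_model.similar_by_vector(grape - grows_on, topn=5)
# equivalent word-level form:
# word2vec_model.most_similar(positive=['grape', 'tree'], negative=['apple'], topn=5)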
In [38]:
# get the word vector for the word 'king'
king = word2vec_model['king']
# get the word vector for the word 'queen'
queen = word2vec_model['queen']
# find the words whose vectors are most similar to the combination of 'king' and 'queen'
word2vec_model.most_similar(positive=[king, queen])
In [ ]: