# Initialize the dataset and the model.
word2vec = Word2Vec(
    data_path='text8',
    vocabulary_size=50000,
    embedding_size=300,
)

# The whole corpus, stored as word indices.
print(word2vec.data[:10])

# Word counts, laid out as [['word', word_count], ...].
# A word's position in this list corresponds to its index.
print(word2vec.word_count[:10])

# Index -> word.
print(word2vec.index2word[34])

# Word -> index.
print(word2vec.word2index['hello'])
Train the model and get the word vectors.
# Train the model (`word2vec` is the Word2Vec instance created in the
# initialization example).
word2vec.train(
    train_steps=200000,
    skip_window=1,
    num_skips=2,
    num_neg=20,
    output_dir='out/run-1',
)

# Save the vectors as a text file.
word2vec.save_vector_txt(path_dir='out/run-1')

# Get the list of vectors; it can be indexed by a raw integer or by a
# word's index looked up through `word2index`.
vector = word2vec.get_list_vector()
print(vector[123])
print(vector[word2vec.word2index['hello']])

# Get the top-k most similar words.
sim_list = word2vec.most_similar('one', top_k=8)
print(sim_list)

# Load a pre-trained model.
word2vec.load_model('out/run-1/model_step200000.pt')