import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
= "I need to write a program in NLTK that breaks a corpus (a large collection of \
text txt files) into unigrams, bigrams, trigrams, fourgrams and fivegrams."
= nltk.word_tokenize(text)
token = ngrams(token,2)
bigrams = Counter(bigrams)
counter print(counter)
# Counter({('I', 'need'): 1, ('need', 'to'): 1, ('to', 'write'): 1, ('write', 'a'): 1, ('a', 'program'): 1, ('program', 'in'): 1, ('in', 'NLTK'): 1, ('NLTK', 'that'): 1, ('that', 'breaks'): 1, ('breaks', 'a'): 1, ('a', 'corpus'): 1, ('corpus', '('): 1, ('(', 'a'): 1, ('a', 'large'): 1, ('large', 'collection'): 1, ('collection', 'of'): 1, ('of', 'txt'): 1, ('txt', 'files'): 1, ('files', ')')....
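The snippet above only counts bigrams, but the stated goal is unigrams through fivegrams. A minimal sketch that loops over all five orders at once (the all_counts name is just illustrative):
# Sketch: build one Counter per n-gram order, from unigrams (n=1) to fivegrams (n=5).
all_counts = {}
for n in range(1, 6):
    all_counts[n] = Counter(ngrams(token, n))
print(all_counts[3].most_common(3))  # e.g. the three most frequent trigrams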
# convert the Counter to a DataFrame
import pandas as pd

counter_df = pd.DataFrame.from_dict(counter, orient='index').reset_index()
counter_df = counter_df.rename(columns={'index': 'event', 0: 'count'})
counter_df
# event count
# 0 (I, need) 1
# 1 (need, to) 1
# 2 (to, write) 1
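Since the question is about a whole corpus of txt files, the same pipeline can be fed from a directory. A hedged sketch, where corpus_dir is a placeholder path, not from the original post:
import glob

# Sketch: aggregate bigram counts over every .txt file in a (hypothetical) folder.
corpus_counter = Counter()
for path in glob.glob('corpus_dir/*.txt'):
    with open(path, encoding='utf-8') as f:
        corpus_counter.update(ngrams(nltk.word_tokenize(f.read()), 2))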
import spacy
import numpy as np
= spacy.load("en")
nlp
= nlp("dog cat banana afskfsd")
tokens1 for token in tokens1:
print(token.text, token.has_vector, token.vector_norm, token.is_oov)
# dog True 19.266302 True
# cat True 19.220264 True
# banana True 17.748499 True
# afskfsd True 20.882006 True
# Small models (package names ending in "sm") don't ship with word vectors and
# only include context-sensitive tensors, so the same token gets a different
# vector value depending on its context (compare the norms above and below).
= nlp("cat banana afskfsd dog")
tokens2 for token in tokens2:
print(token.text, token.has_vector, token.vector_norm, token.is_oov)
# cat True 17.908854 True
# banana True 18.362368 True
# afskfsd True 20.619379 True
# dog True 17.855104 True
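To check the context-sensitivity claim directly, compare the two occurrences of "dog" across the documents; a small sketch:
# "dog" is token 0 in tokens1 and token 3 in tokens2; with tensor-backed
# vectors the two occurrences differ, so their similarity is below 1.0.
dog_1 = tokens1[0]
dog_2 = tokens2[3]
print(dog_1.similarity(dog_2))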
# Word Vector Representation
# each document is converted into a vector with 96 values
print(tokens1.vector.shape)
# (96,)
print(tokens2.vector.shape)
# (96,)
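Doc.vector defaults to the average of the token vectors, which can be verified by hand; a sketch reusing the numpy import above:
# Sketch: the document vector should match the mean of its token vectors.
manual = np.mean([t.vector for t in tokens1], axis=0)
print(np.allclose(tokens1.vector, manual))  # expected: True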
= nlp("dog cat banana bybe")
tokens3
tokens1.similarity(tokens2)  # 0.9431841314260427
tokens1.similarity(tokens3)  # 0.8992161555367724
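By default .similarity() is cosine similarity between the two document vectors, so the numbers above can be reproduced manually; a sketch:
def cosine(a, b):
    # plain cosine similarity between two numpy vectors
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print(cosine(tokens1.vector, tokens2.vector))  # should match tokens1.similarity(tokens2)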
# Unofficial Chinese models: https://github.com/howl-anderson/Chinese_models_for_SpaCy
# After downloading, install with: pip install ./zh_core_web_sm-2.0.5.tar.gz
import spacy
= spacy.load("zh_core_web_sm")
nlp = nlp("测试一下分词")
test_doc_1 print(test_doc_1.vector)
for token in test_doc_1:
print(token.text, token.has_vector, token.vector_norm, token.is_oov)
= nlp("这是什么东西")
test_doc_2 = nlp("测试分词")
test_doc_3 print(test_doc_1.similarity(test_doc_2))
print(test_doc_1.similarity(test_doc_3))
doc = nlp('西门子将努力参与中国的三峡工程建设。')

You can also use spaCy's built-in visualizer:
from spacy import displacy
displacy.render(doc, style='ent')
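Outside a notebook, displacy.render returns the markup as a string, so the result can be inspected or saved; a sketch where the ents.html filename is arbitrary:
# Print the recognized entities, then save the rendered page to disk.
for ent in doc.ents:
    print(ent.text, ent.label_)

html = displacy.render(doc, style='ent', page=True)
with open('ents.html', 'w', encoding='utf-8') as f:
    f.write(html)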