1 nltk

1.1 n-grams

import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter

text = "I need to write a program in NLTK that breaks a corpus (a large collection of \
txt files) into unigrams, bigrams, trigrams, fourgrams and fivegrams."

token = word_tokenize(text)
bigrams = ngrams(token,2)
counter = Counter(bigrams)
print(counter)
# Counter({('I', 'need'): 1, ('need', 'to'): 1, ('to', 'write'): 1, ('write', 'a'): 1, ('a', 'program'): 1, ('program', 'in'): 1, ('in', 'NLTK'): 1, ('NLTK', 'that'): 1, ('that', 'breaks'): 1, ('breaks', 'a'): 1, ('a', 'corpus'): 1, ('corpus', '('): 1, ('(', 'a'): 1, ('a', 'large'): 1, ('large', 'collection'): 1, ('collection', 'of'): 1, ('of', 'txt'): 1, ('txt', 'files'): 1, ('files', ')')....
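The example sentence mentions unigrams through fivegrams, while the code above only counts bigrams. A minimal sketch that reuses the same token list and the ngrams helper for every order from 1 to 5 (the loop is illustrative, not part of the original):

# count n-grams for every order from unigrams (n=1) to fivegrams (n=5)
for n in range(1, 6):
    grams = Counter(ngrams(token, n))
    print(n, grams.most_common(3))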

# convert counter to dataframe
import pandas as pd

counter_df = pd.DataFrame.from_dict(counter, orient='index').reset_index()
counter_df = counter_df.rename(columns={'index':'event', 0:'count'})
counter_df

#                   event  count
# 0             (I, need)      1
# 1            (need, to)      1
# 2           (to, write)      1
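On a toy sentence every count is 1, but on a real corpus the counts differ, so it helps to sort. A small illustrative sketch, assuming the counter and dataframe built above:

# most frequent bigrams first
print(counter_df.sort_values('count', ascending=False).head(10))
# the Counter itself can do the same without pandas
print(counter.most_common(10))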

2 spacy

import spacy
import numpy as np

nlp = spacy.load("en_core_web_sm")

tokens1 = nlp("dog cat banana afskfsd")
for token in tokens1:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)
    
# dog True 19.266302 True
# cat True 19.220264 True
# banana True 17.748499 True
# afskfsd True 20.882006 True


# sm models (all packages that end in "sm") don't ship with static word vectors;
# they only include context-sensitive tensors, so the same token gets a
# different vector here than in tokens1 above.
tokens2 = nlp("cat banana afskfsd dog")
for token in tokens2:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)
    
# cat True 17.908854 True
# banana True 18.362368 True
# afskfsd True 20.619379 True
# dog True 17.855104 True
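A quick way to check that these vectors really are context-sensitive is to compare the same word across the two documents. The indices below assume the token orders printed above ("dog" is token 0 in tokens1 and token 3 in tokens2):

# with context-sensitive tensors the two "dog" vectors should differ
print(np.allclose(tokens1[0].vector, tokens2[3].vector))
# False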

# Word Vector Representation
# each document is converted into a vector with 96 values
print(tokens1.vector.shape)
# (96,)
print(tokens2.vector.shape)
# (96,)
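By default the document vector is the average of its token vectors; a short sketch to confirm:

# Doc.vector defaults to the mean of the token vectors
doc_vec = np.mean([t.vector for t in tokens1], axis=0)
print(np.allclose(tokens1.vector, doc_vec))
# True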


tokens3 = nlp("dog cat banana bybe")

tokens1.similarity(tokens2)
# 0.9431841314260427
tokens1.similarity(tokens3)
# 0.8992161555367724
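similarity() is the cosine similarity between the two document vectors, so it can be reproduced by hand:

# manual cosine similarity between the two document vectors
v1, v2 = tokens1.vector, tokens2.vector
print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
# should match tokens1.similarity(tokens2)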

2.1 Chinese

# Unofficial Chinese model: https://github.com/howl-anderson/Chinese_models_for_SpaCy
# After downloading, install with: pip install ./zh_core_web_sm-2.0.5.tar.gz

import spacy
nlp = spacy.load("zh_core_web_sm")
test_doc_1 = nlp("测试一下分词")
print(test_doc_1.vector)

for token in test_doc_1:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)
    
test_doc_2 = nlp("这是什么东西")
test_doc_3 = nlp("测试分词")
print(test_doc_1.similarity(test_doc_2))
print(test_doc_1.similarity(test_doc_3))

doc = nlp('西门子将努力参与中国的三峡工程建设。')
spaCy's built-in visualizer can also be used:

from spacy import displacy
displacy.render(doc, style='ent')
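The entities behind the 'ent' visualization can also be printed directly (the labels depend on the Chinese model):

# list the named entities the model found
for ent in doc.ents:
    print(ent.text, ent.label_)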