## Corpus processing
### Basic operations with nltk.book
from nltk.book import *
text1.concordance("monstrous") # show every occurrence of "monstrous" with its context
text1.similar("monstrous") # words that appear in similar contexts
text1.common_contexts(["monstrous", "very"]) # contexts shared by the two words
text1.generate() # generate random text in the style of text1
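These calls only work once the book corpora are installed; a one-time download makes the snippet above runnable ('book' is the NLTK downloader's collection identifier for the book data):
import nltk
nltk.download('book') # one-time fetch of the corpora behind nltk.book
from nltk.book import * # now loads text1 .. text9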
### Word frequency counts
# FreqDist counts how often each word occurs
from nltk import *
from nltk.book import *
fdist1 = FreqDist(text1)
fdist1['whale'] # number of times "whale" occurs
fdist1.plot(50, cumulative=True) # cumulative frequency plot of the 50 most common words
# !! the most frequent words are mostly uninformative, and the rare words are far too many to inspect
# words longer than 7 characters that also occur more than 7 times
fdist5 = FreqDist(text5)
sorted([w for w in set(text5) if len(w) > 7 and fdist5[w] > 7])
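The flip side of the remark above can be made concrete with hapaxes(), FreqDist's list of words occurring exactly once:
from nltk import FreqDist
from nltk.book import text5
fdist5 = FreqDist(text5)
rare = fdist5.hapaxes() # words seen only once
len(rare) # typically a large fraction of the vocabulary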
### Collocations and bigrams
# bigrams() turns a word list into adjacent word pairs
bigrams(["more", "is", "said", "than"])
text4.collocations() # print the frequent two-word collocations in text4
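collocations() selects bigrams that occur more often than chance; to just count raw bigram frequencies, bigrams() and FreqDist combine directly (most_common() is the NLTK 3 / collections.Counter API):
import nltk
from nltk.book import text4
bigram_fdist = nltk.FreqDist(nltk.bigrams(text4))
bigram_fdist.most_common(10) # ten most frequent adjacent word pairs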
### Functions defined by NLTK's frequency distribution (FreqDist) class
fdist = FreqDist(samples) # build a frequency distribution from the given samples
fdist.inc(sample) # increment the count for one sample (NLTK 2; in NLTK 3 use fdist[sample] += 1)
fdist['monstrous'] # count of a given sample
fdist.freq('monstrous') # relative frequency of a given sample
fdist.N() # total number of samples counted
fdist.keys() # samples, most frequent first
for sample in fdist: # iterate over samples in descending frequency
fdist.max() # sample with the greatest count
fdist.tabulate() # tabulate the frequency distribution
fdist.plot() # graphical plot of the frequency distribution
fdist.plot(cumulative=True) # cumulative version of the plot
fdist1 < fdist2 # do samples in fdist1 occur less frequently than in fdist2?
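A tiny self-contained run of several of these methods, on a hand-made sample list instead of a corpus text:
from nltk import FreqDist
samples = "the cat sat on the mat the end".split()
fdist = FreqDist(samples)
fdist['the'] # 3
fdist.N() # 8 samples in total
fdist.max() # 'the'
fdist.freq('the') # 3/8 = 0.375
fdist.tabulate() # prints a small frequency table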
### Processing text from the web
import nltk
from urllib2 import urlopen # Python 2 stdlib; urllib.request on Python 3
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read()
raw = nltk.clean_html(html) # strip HTML markup (NLTK 2; removed in NLTK 3)
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)
text.concordance('gene')
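The block above is Python 2 / NLTK 2; nltk.clean_html was removed in NLTK 3, which refers users to external HTML cleaners instead. A rough Python 3 equivalent, assuming BeautifulSoup (pip install beautifulsoup4) as the tag stripper:
import nltk
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read().decode('utf-8', errors='ignore')
raw = BeautifulSoup(html, 'html.parser').get_text() # strip markup, keep the text
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)
text.concordance('gene')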
### Workflow
# html -> ASCII -> text -> vocab
html = urlopen(url).read()
raw = nltk.clean_html(html)
raw = raw[xx : xxx] # trim header/footer boilerplate (slice bounds left as placeholders)
tokens = nltk.word_tokenize(raw)
tokens = tokens[xx: xxx] # keep only the slice of interest (placeholders)
text = nltk.Text(tokens)
words = [w.lower() for w in text]
vocab = sorted(set(words))
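The same pipeline wrapped into a function for reuse (url_to_vocab is a hypothetical helper name, written against the Python 2 / NLTK 2 API used above; the slice bounds become caller-supplied parameters):
import nltk
from urllib2 import urlopen

def url_to_vocab(url, start=0, end=None):
    html = urlopen(url).read()
    raw = nltk.clean_html(html) # NLTK 2 only; see the BeautifulSoup sketch above
    tokens = nltk.word_tokenize(raw)[start:end] # optional trimming of page boilerplate
    words = [w.lower() for w in tokens] # normalize case
    return sorted(set(words))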
### Analyzing Chinese text
words = open('xxx.data', 'r').read().decode('utf-8').split(' ') # UTF-8 file, already segmented with spaces
fdist = FreqDist(words)
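The two lines above assume the file is already segmented with spaces; raw Chinese has no word boundaries, so a segmenter must run first. A minimal sketch with the third-party jieba library (an assumption here, not part of NLTK):
# -*- coding: utf-8 -*-
import jieba
from nltk import FreqDist
raw = open('xxx.data', 'r').read().decode('utf-8') # Python 2 style, as above
words = jieba.lcut(raw) # segment into a list of words
fdist = FreqDist(words)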
from nltk.corpus import PlaintextCorpusReader
corpus_root = './dict'
wordlists = PlaintextCorpusReader(corpus_root, ".*")
wordlists.fileids()
wordlists.words()
text = nltk.Text(wordlists.words())
unicode_text = u'\u015bwiatowej' # a unicode string ("światowej")
unicode_text = unicode_text.encode("utf-8") # encode to UTF-8 bytes (Python 2)
tokens = nltk.word_tokenize(unicode_text)
text = nltk.Text(tokens)
### WordNet
from nltk.corpus import wordnet as wn
wn.synsets('motorcar')
wn.synset('car.n.01').lemma_names # synonyms in the synset
wn.synset('car.n.01').definition # dictionary-style definition
wn.synset('car.n.01').examples # example sentences
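lemma_names, definition and examples were properties in NLTK 2; NLTK 3 made them methods, so a version that runs on current NLTK is:
from nltk.corpus import wordnet as wn
car = wn.synset('car.n.01')
car.lemma_names() # synonyms in this synset
car.definition() # dictionary-style gloss
car.examples() # example sentences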
### Unicode files
import codecs
f = codecs.open(path, encoding="utf-8") # lines come back decoded from UTF-8
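A short usage sketch (path is a placeholder filename); codecs.open is the Python 2 route to decoded text, while on Python 3 the built-in open(path, encoding='utf-8') does the same job:
import codecs
path = 'somefile.txt' # placeholder
f = codecs.open(path, encoding='utf-8')
for line in f:
    print(line.strip())
f.close()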