This article collects typical usage examples of the Python class nltk.probability.FreqDist. If you have been wondering what the FreqDist class is for, how to use it, or what FreqDist code looks like in practice, the curated class examples below may help.
The following shows 20 code examples of the FreqDist class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
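Before the individual examples, here is a minimal sketch of the core FreqDist API (counting tokens and listing the most frequent ones); the sample sentence is made up purely for illustration:

from nltk.probability import FreqDist

# Count token frequencies in a tiny, made-up sample sentence.
tokens = "the cat sat on the mat and the cat slept".split()
fdist = FreqDist(tokens)

print(fdist["the"])          # frequency of a single token -> 3
print(fdist.N())             # total number of tokens counted
print(fdist.most_common(3))  # the three most frequent tokens with their counts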
Example 1: create_word_scores
def create_word_scores(posWords, negWords, posTag, negTag):
    from nltk.probability import FreqDist, ConditionalFreqDist
    from nltk.metrics import BigramAssocMeasures
    import itertools
    posWords = list(itertools.chain(*posWords))  # flatten the nested list into a single list of words
    negWords = list(itertools.chain(*negWords))  # same for the negative words
    word_fd = FreqDist()  # frequency of every word across both corpora
    cond_word_fd = ConditionalFreqDist()  # word frequencies within the positive and the negative texts, respectively
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd[posTag][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd[negTag][word] += 1
    pos_word_count = cond_word_fd[posTag].N()  # total number of positive word tokens
    neg_word_count = cond_word_fd[negTag].N()  # total number of negative word tokens
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        # Chi-square statistic of the word in the positive corpus; other measures
        # such as pointwise mutual information could be used here instead.
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count)
        # A word's informativeness is its positive chi-square score plus its negative chi-square score.
        word_scores[word] = pos_score + neg_score
    return word_scores  # maps each word to its informativeness score
Author: coolspiderghy, Project: sina_weibo_crawler, Lines: 27, Source: extractFeatures.py
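One plausible way to consume the returned scores is to keep only the highest-scoring words as classifier features. The sketch below is not part of the original project; the toy corpora and the cut-off of 5 are made-up placeholders:

# Hypothetical usage with two tiny, made-up corpora (lists of tokenized documents).
pos_docs = [["good", "great", "film"], ["great", "acting"]]
neg_docs = [["bad", "boring", "film"], ["awful", "plot"]]
word_scores = create_word_scores(pos_docs, neg_docs, 'pos', 'neg')
# Keep the most informative words (here the top 5) as the feature vocabulary.
best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:5]
best_words = set(word for word, score in best)
print(best_words)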
Example 2: summarize
def summarize(self, input, num_sentences):
    punt_list = ['.', ',', '!', '?']
    summ_sentences = []
    sentences = input
    # sentences = sent_tokenize(input)
    lowercase_sentences = [sentence.lower() for sentence in sentences]
    # print(lowercase_sentences)
    saito = ' '.join(sentences)
    s = input
    # Strip punctuation characters and split the text into words.
    ts = ''.join([o for o in s if o not in punt_list]).split()
    lowercase_words = [word.lower() for word in ts]
    words = [word for word in lowercase_words if word not in stopwords.words()]
    word_frequencies = FreqDist(words)
    most_frequent_words = [pair[0] for pair in word_frequencies.most_common(100)]
    # add sentences with the most frequent words
    if len(s) < num_sentences:
        num_sentences = len(s)
    for word in most_frequent_words:
        for i in range(len(lowercase_sentences)):
            if len(summ_sentences) < num_sentences:
                if (lowercase_sentences[i] not in summ_sentences and word in lowercase_sentences[i]):
                    summ_sentences.append(lowercase_sentences[i])
            else:
                break
        if len(summ_sentences) >= num_sentences:
            break
    # reorder the selected sentences by their position in the original text
    summ_sentences.sort(key=lambda s1: saito.find(s1))
    return summ_sentences
Author: benjbigot, Project: BNN_WIN, Lines: 35, Source: naivesumm.py
Example 3: create_words_bigrams_scores
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # Build a separate collocation finder for each corpus so the positive
    # bigrams are not computed from the negative words.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
Author: wac81, Project: LSI-for-ChineseDocument, Lines: 35, Source: pos_neg_ml_feature.py
Example 4: make_summary
def make_summary(text):
    stemmed = []
    tokens = word_tokenize(text)
    sent = sent_tokenize(text)
    # Filter out stopwords (removing items from a list while iterating over it
    # skips elements, so build a new list instead).
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    stemmer = PorterStemmer()
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    # freq(stemmed)
    stemmed = [word.lower() for word in stemmed]
    word_freq = FreqDist(stemmed)
    most_freq_words = [pair[0] for pair in word_freq.most_common(60)]
    working_sent = [sentence.lower() for sentence in sent]
    out_sent = []
    for word in most_freq_words:
        for i in range(0, len(working_sent)):
            if (word in working_sent[i] and sent[i] not in out_sent):
                out_sent.append(sent[i])
                break
            if len(out_sent) >= 5:
                break
        if len(out_sent) >= 5:
            break
    return reorder(out_sent, text)
Author: aigeano, Project: Summaly, Lines: 35, Source: summaly.py
Example 5: train_supervised
def train_supervised(self, labelled_sequences, **kwargs):
    """
    Supervised training maximising the joint probability of the symbol and
    state sequences. This is done via collecting frequencies of
    transitions between states, symbol observations while within each
    state and which states start a sentence. These frequency distributions
    are then normalised into probability estimates, which can be
    smoothed if desired.

    :return: the trained model
    :rtype: HiddenMarkovModelTagger
    :param labelled_sequences: the training data, a set of
        labelled sequences of observations
    :type labelled_sequences: list
    :param kwargs: may include an 'estimator' parameter, a function taking
        a FreqDist and a number of bins and returning a ProbDistI;
        otherwise a MLE estimate is used
    """
    # default to the MLE estimate
    estimator = kwargs.get('estimator')
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    known_symbols = set(self._symbols)
    known_states = set(self._states)

    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[_TAG]
            symbol = token[_TEXT]
            if lasts is None:
                starting[state] += 1
            else:
                transitions[lasts][state] += 1
            outputs[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in known_states:
                self._states.append(state)
                known_states.add(state)
            if symbol not in known_symbols:
                self._symbols.append(symbol)
                known_symbols.add(symbol)

    # create probability distributions (with smoothing)
    N = len(self._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(self._symbols))

    return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
Author: pierrefribourg, Project: nltk, Lines: 60, Source: hmm.py
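For context, this method is normally not called directly; it is reached through nltk.tag.hmm.HiddenMarkovModelTrainer. A minimal sketch, assuming the NLTK treebank corpus data has been downloaded:

from nltk.corpus import treebank
from nltk.tag.hmm import HiddenMarkovModelTrainer

# Train an HMM tagger on tagged Treebank sentences (lists of (word, tag) pairs)
# and tag a new sentence with it.
train_sents = treebank.tagged_sents()[:3000]
tagger = HiddenMarkovModelTrainer().train_supervised(train_sents)
print(tagger.tag("the quick brown fox jumps over the lazy dog".split()))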
Example 6: train_MLT
def train_MLT(self, tagged_train_data, untagged_training_data):
    """
    Builds a most-likely-tag tagger from the given tagged training data (as words).
    :param tagged_train_data: list of (word, tag) pairs
    :param untagged_training_data: list of untagged sentences
    :return: model
    """
    # find the set of words
    words = set()
    for sent in untagged_training_data:
        for word in sent:
            words.add(word)
    # Define mlt_dict of format {word1: {(word1, tag1): count1, (word1, tag2): count2, ...}, ...}
    mlt_dict = dict()
    # Initialize keys and values to it
    for word in words:
        mlt_dict[word] = dict()
    # Compute the frequency distribution of tagged words
    tagged_words_fdist = FreqDist(tagged_train_data)
    for tagged_word, count in tagged_words_fdist.items():
        mlt_dict[tagged_word[0]][tagged_word] = count
    # Update the dict to contain the most likely tag for each word
    # for word, inside_dict in mlt_dict.items():
    #     max_val = max(inside_dict.values())
    #     inside_dict =
    print("Training is done!")
    return mlt_dict
Author: GaddipatiAsish, Project: Natural-Language-Processing, Lines: 28, Source: Q6_Part1.py
Example 7: most_frequent_words
def most_frequent_words(path, top):
    root_path = "./" + path
    writers = os.listdir(root_path)
    word_set = set()
    for writer in writers:
        if writer.find(".") != -1:
            continue
        inside_folder = root_path + "//" + writer
        files = os.listdir(inside_folder)
        formated_text = ""
        for file in files:
            file_path = root_path + "//" + writer + "//" + file
            fw = open(file_path, "r", encoding="utf8")
            article = fw.read()
            # print(article)
            formated_text += " "
            formated_text += formatText(article)
            fw.close()
        words = get_bigrams(formated_text)
        fdist = FreqDist(w for w in words if
                         len(w) > 1 and isEnglish(w) == False and w != "``")
        keys = fdist.most_common(top)
        for key in keys:
            # print(str(key[0]) + " , " + str(key[1]) + "\n")
            word_set.add(key[0])
    print(word_set)
    fw = open("./Features/Bigrams.csv", "w", encoding="utf8")
    for word in word_set:
        fw.write(word)
        fw.write("\n")
    fw.close()
Author: olee12, Project: Stylogenetics, Lines: 32, Source: most_frequent_bigrams.py
Example 8: classify
def classify(self, feats):
    counts = FreqDist()
    for classifier in self._classifiers:
        counts[classifier.classify(feats)] += 1
    return counts.max()
Author: RomanZacharia, Project: python_text_processing_w_nltk2_cookbook, Lines: 7, Source: classification.py
Example 9: scores
def scores(self, docId):
    """
    Return the score from the given document to every other
    document in the index. Documents not listed are assumed
    to have no similarity detected by shared terms.

    :param docId: ID of doc to compare other docs to.
    :returns: A FreqDist mapping document IDs to similarity scores.
        Larger scores are better.
    """
    if not self._idf:
        self._computeIdfs()

    # Track the scores
    docScores = FreqDist()
    for termid, freq in self.termFrequencies[docId].items():
        # Find the frequency with which this term appears in other documents.
        inverseDocumentFrequency = self._idf[termid]
        for otherDocId in self.termsToDocuments[termid]:
            if otherDocId == docId:
                # Skip this document
                continue
            # Find the term frequency of the term in the other document.
            otherFreq = self.termFrequencies[otherDocId][termid]
            # Score proportional to the product of the frequencies times the
            # inverse of the document frequency.
            docScores[otherDocId] += freq * otherFreq * inverseDocumentFrequency
    return docScores
Author: timdestan, Project: quiz-bowl-entity-resolution, Lines: 32, Source: invertedindex.py
Example 10: word_tag_model
def word_tag_model(words, tagged_words, limit=200):
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(tagged_words)
    most_freq = (word for word, count in fd.most_common(limit))
    return dict((word, cfd[word].max()) for word in most_freq)
Author: byam, Project: predictEPL, Lines: 7, Source: tag_util.py
Example 11: get_term_freq_dict
def get_term_freq_dict(data):
    # Change it to lower case
    lower_data = data.lower()
    # Tokenize it
    tokens = word_tokenize(lower_data)
    freq_dist = FreqDist(tokens)
    # Lemmatize each term and merge the counts of terms that share a lemma
    word_freq = {}
    for term in freq_dist.keys():
        lemmatize_term = wordnet.lemmatize(term)
        val = freq_dist.get(term)
        # If the lemma already exists in word_freq, add to its count
        if lemmatize_term in word_freq:
            freq = word_freq[lemmatize_term]
            word_freq[lemmatize_term] = freq + val
        # Else, assign the count
        else:
            word_freq[lemmatize_term] = val
    return word_freq
Author: Maverickwarrior, Project: Search-Engine, Lines: 26, Source: tokenize_docs.py
Example 12: choose_tag
def choose_tag(self, tokens, index, history):
    tags = FreqDist()
    for tagger in self._taggers:
        tags[tagger.choose_tag(tokens, index, history)] += 1
    return tags.max()
Author: ANB2, Project: nltk-trainer, Lines: 7, Source: taggers.py
Example 13: create_word_bigram_scores
def create_word_bigram_scores(posWords, negWords):
    # Build a separate collocation finder for each corpus so the positive
    # bigrams are not computed from the negative words.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    pos = posWords + posBigrams  # single words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
Author: delili, Project: NLP_Comments_Sentiment_Analysis, Lines: 29, Source: process.py
Example 14: create_word_scores
def create_word_scores(posWords, negWords):
    file_scores = open("cn_sample_data/scores.txt", "w")
    # Count overall and per-class word frequencies.
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in negWords:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][str(word)], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][str(word)], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Write the scores to file in descending order of informativeness.
    for key in sorted(word_scores, key=word_scores.get, reverse=True):
        file_scores.write(str(key) + " : " + str(word_scores[str(key)]) + "\n")
    file_scores.close()
    return word_scores
Author: delili, Project: NLP_Comments_Sentiment_Analysis, Lines: 25, Source: process.py
Example 15: GetHighInformationWordsChi
def GetHighInformationWordsChi(num_bestwords):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in movie_reviews.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1

    for word in movie_reviews.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:num_bestwords]
    bestwords = set(w for w, s in best)
    return bestwords
Author: ai2010, Project: machine_learning_for_the_web, Lines: 28, Source: views.py
Example 16: buildCategoryDictionary
def buildCategoryDictionary(category):
    tweetList = twitter_fetch.get_tweets_text(classn=category)
    freq = FreqDist()
    for tweet in tweetList:
        freq.update(word for word in tokenizeTweet(tweet))
    saveDictionaryToFile(freq, category + categoryDictFilePath)
    return freq
Author: elms1990, Project: twitter-ml, Lines: 7, Source: textMining.py
Example 17: mostCommWords
def mostCommWords(self, tag, pos_tag_pattern):
    """
    Helper method for mostCommNouns and mostCommVerbs.

    Arguments:
        tag -- the hashtag for which the most commonly co-occurring words are computed
        pos_tag_pattern -- the regular expression used to match the POS tags
    Return: a list of the top 20 words (matching the POS pattern) associated with the input hashtag
    """
    words = {}
    topTwenty = []
    for line in self.lines:
        hasTag = False
        for t in self.tokenizer(line, hashtag_pattern):
            if t == tag:
                hasTag = True
                break
        if hasTag:
            counts = FreqDist()
            tokens = self.tokenizer(line, word_pattern)
            pos = nltk.pos_tag(tokens)
            for p in pos:
                if re.match(pos_tag_pattern, p[1]):
                    counts[p[0]] += 1
            for n in counts.keys():
                if n in words:
                    words[n] = words[n] + counts[n]
                else:
                    words[n] = counts[n]
    words_sorted_by_counts = sorted(words.items(), key=lambda x: x[1], reverse=True)
    for i in range(0, 20):
        topTwenty.append(words_sorted_by_counts[i][0])
    return topTwenty
Author: 52nlp, Project: nlp, Lines: 34, Source: tokenizer.py
Example 18: train
def train(labeled_featuresets, estimator=ELEProbDist):
    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    fnames = set()

    # Count how often each label occurs and, for every (label, feature name)
    # pair, how often each feature value occurs.
    for featureset, label in labeled_featuresets:
        label_freqdist[label] += 1
        for fname, fval in featureset.items():
            feature_freqdist[label, fname][fval] += 1
            feature_values[fname].add(fval)
            fnames.add(fname)

    # For each label, add a None count for features that were missing from
    # some of that label's training examples.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            feature_freqdist[label, fname][None] += num_samples - count
            feature_values[fname].add(None)

    label_probdist = estimator(label_freqdist)

    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return NaiveBayesClassifier(label_probdist, feature_probdist)
Author: aemperor, Project: twitter_sentiment, Lines: 28, Source: main.py
Example 19: similar
def similar(self, word, num=20):
    """
    Distributional similarity: find other words which appear in the
    same contexts as the specified word; list most similar words first.

    :param word: The word used to seed the similarity search
    :type word: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.similar_words()
    """
    if '_word_context_index' not in self.__dict__:
        print('Building word-context index...')
        self._word_context_index = ContextIndex(self.tokens,
                                                filter=lambda x: x.isalpha(),
                                                key=lambda s: s.lower())

    # words = self._word_context_index.similar_words(word, num)

    word = word.lower()
    wci = self._word_context_index._word_to_contexts
    if word in wci.conditions():
        contexts = set(wci[word])
        fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                      if c in contexts and not w == word)
        # Take the most frequent shared-context words so the most similar come first.
        words = [w for w, _ in fd.most_common(num)]
        print(tokenwrap(words))
    else:
        print("No matches")
Author: damorelse, Project: MachineTranslation, Lines: 29, Source: text.py
Example 20: BigramAll
def BigramAll():
    to_save_folder = "./#Bigram[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        big = list(bigrams(w for w in words if len(w) > 1 and w != "``"))
        myBig = []
        for bi in big:
            myBig.append(bi[0] + " " + bi[1])
        fdist = FreqDist(str(w) for w in myBig)
        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]).strip() + "," + str(key[1]).strip() + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[bigram_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Author: olee12, Project: Stylogenetics, Lines: 29, Source: MakeNormalData.py
Note: the nltk.probability.FreqDist class examples in this article were compiled from source-code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective authors, and copyright of the source code remains with the original authors; please consult each project's license before redistributing or reusing the code.