This article collects typical usage examples of the Python nltk.FreqDist class. If you have been wondering what FreqDist is for, how it works, or how to use it, the hand-picked class examples below may help.
The following presents 20 code examples of the FreqDist class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
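Before the examples, here is a minimal self-contained sketch of the core FreqDist API under NLTK 3; the token list is purely illustrative:

from nltk import FreqDist

tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']
fd = FreqDist(tokens)        # counts any iterable of hashable items
print(fd['the'])             # 2 -- raw count of one sample
print(fd.N())                # 6 -- total number of samples counted
print(fd.freq('the'))        # 0.333... -- relative frequency
print(fd.most_common(2))     # the two most frequent (sample, count) pairs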
Example 1: top_words_from_corpus
def top_words_from_corpus(self, num_words, test_name):
    corpus_tokens = []
    for i in self.corpus_vars["corpus_member_ids"]:
        title = 'document_' + str(i)
        doc_tokens = Library.document_instances[title].metadata["tokenized_doc"]
        corpus_tokens += doc_tokens
    top_words = []
    fdist_corpus = FreqDist(corpus_tokens)
    fdist_list = fdist_corpus.items()
    if test_name == "Function Word PCA":
        function_pos = ['IN', 'TO', 'CC', 'DT', 'PDT', 'WDT']
        for i in fdist_list:
            top_words.append(i[0])
            if len(top_words) == num_words:
                tagged_top = nltk.pos_tag(top_words)
                for j, k in tagged_top:
                    if k not in function_pos:
                        top_words.remove(j)
            if len(top_words) == num_words:
                break
    elif test_name == "Burrows's Delta":
        for i in fdist_list:
            top_words.append(i[0])
            if len(top_words) == num_words:
                break
    return top_words
Author: mjlavin80, Project: py_style, Lines of code: 26, Source file: py_styleModel.py
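A portability note on this example (and on Examples 3, 4, 14, 16 and 17 below): the code slices fdist.items() or fdist.keys() and assumes the result is sorted by descending frequency. That was NLTK 2 behaviour; in NLTK 3, FreqDist is a collections.Counter subclass whose items()/keys() views are not frequency-ordered (and cannot be sliced under Python 3), so the explicit idiom is most_common(). A minimal sketch under that assumption, with stand-in data:

from nltk import FreqDist

corpus_tokens = ['a', 'rose', 'is', 'a', 'rose', 'a']   # stand-in for the tokens gathered above
num_words = 2
fdist_corpus = FreqDist(corpus_tokens)
# NLTK 2: fdist_corpus.items()[:num_words] was already sorted by count.
# NLTK 3: ask for the ordering explicitly.
top_words = [word for word, count in fdist_corpus.most_common(num_words)]
print(top_words)   # ['a', 'rose']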
Example 2: get_hosts
def get_hosts(year):
    '''Hosts is a list of one or more strings. Do NOT change the name
    of this function or what it returns.'''
    # Your code here
    file_name = 'gg%s.json' % year
    with open(file_name, 'r') as data:
        db = json.load(data)
    hosts = []
    pairs = []
    for f in db:
        e = f['text']
        if 'and' in e.lower():
            for proper in strip_proper_pairs(normalize_str(e).split()):
                pair = proper.split('and')
                if len(pair) == 2:
                    if pair[0] != ' ' and pair[1] != ' ':
                        pairs.append((pair[0].lower().replace('\'','\"').strip(' '), pair[1].lower().replace('\'','\"').strip(' ')))
    pairs_freq = FreqDist(pairs)
    if len(pairs_freq.most_common(10)[0][0][0].split(' ')) < 2:
        hosts.append(pairs_freq.most_common(10)[1][0][0])
        hosts.append(pairs_freq.most_common(10)[1][0][1])
    else:
        hosts.append(pairs_freq.most_common(10)[0][0][0])
        hosts.append(pairs_freq.most_common(10)[0][0][1])
    return hosts
Author: GregoryElliott, Project: TGMA_NLP_Project, Lines of code: 25, Source file: gg_api.py
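Note that the example above feeds (name, name) tuples into FreqDist rather than single words; this works because FreqDist counts any hashable item, and most_common() then yields ((first, second), count) entries, which is why the code indexes like pairs_freq.most_common(10)[0][0][0]. A small illustrative sketch (the names are made up):

from nltk import FreqDist

pairs = [('tina', 'amy'), ('tina', 'amy'), ('george', 'leo')]
pair_freq = FreqDist(pairs)                    # tuples are hashable, so they can be counted too
top_pair, count = pair_freq.most_common(1)[0]  # (('tina', 'amy'), 2)
print(top_pair[0], top_pair[1], count)         # tina amy 2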
Example 3: make_cutOff
def make_cutOff(flatList, bottomCutOff, topCutOff):
    '''
    INPUT:
    flatList is a 1-d list of all tokens in a set of tweets; both bottomCutOff
    and topCutOff are integers
    OUTPUT:
    newVocab = a 1-d list of all tokens we want to keep
    thrownOut = a 1-d list of all tokens to throw out
    '''
    fd = FreqDist(flatList)
    newVocab = []
    thrownOut = []
    for item in fd.items()[:topCutOff]:
        # append most common words
        thrownOut.append(item)
    for item in fd.items()[topCutOff:]:
        if item[1] > bottomCutOff:
            # append good words
            newVocab.append(item[0])
        else:
            # append uncommon words
            thrownOut.append(item)
    print 'Cutoffs made...'
    return newVocab, thrownOut
Author: JRMeyer, Project: twitter, Lines of code: 27, Source file: twitter_lda.py
Example 4: main
def main():
    keyword_list = ["Top Secret", "Secret Service", "Classified", "Targeted", "Assassination",
                    "Kill Program", "NSA", "wire", "CIA", "FBI", "DEA", "DOJ", "hackers",
                    "hacker", "exploit code", "Defense", "Intelligence", "Agency"]
    file_name = "tweets_output.txt"
    pickle_words_file = "words.pickle"
    pickle_words(file_name, pickle_words_file, keyword_list)
    pickle_tweets_file = "tweets.pickle"
    pickle_tweets(file_name, pickle_tweets_file)
    words = load(open("words.pickle"))
    tweets = load(open("tweets.pickle"))
    freq_dist = FreqDist(words)
    print tweets
    print("===")
    print("Conducting Frequency and Lexical Diversity Analysis of Twitter Search Space: ")
    print("===")
    print("Number of words within the twitter search space: ")
    print(len(words))
    print("Number of unique words within twitter search space: ")
    print(len(set(words)))
    print("Lexical Diversity of unique words within twitter search space: ")
    print(lexical_diversity(words))
    print("===")
    print("Conducting Native Language Processing Analysis Utilizing Python NLTK")
    print("===")
    print("Top 50 Frequent Words within the Twitter Search Space: ")
    print(freq_dist.keys()[:50])
    print("===")
    print("Bottom 50 Frequent Words within the Twitter Search Space: ")
    print(freq_dist.keys()[-50:])
    print("===")
Author: 0day1day, Project: OSINT, Lines of code: 31, Source file: nltk_tweet_analysis.py
Example 5: get_word_features
def get_word_features(wordlist):
    wordlist = FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
Author: toshi09, Project: UserProfilingInSocialMedia, Lines of code: 7, Source file: naive_bayes_nltk.py
Example 6: __init__
def __init__(self, num_topics, alpha_topic = 1.0, alpha_word = 1.0,
             max_tables = 50000, sanity_check=False, initialize=False,
             report_filename="topic_history.txt"):
    self.max_tables = max_tables
    self._alphabet = FreqDist()
    # store all words seen in a list so they are associated with a unique ID.
    self.initialize_index()
    self._words = FreqDist()
    self.alpha_topic = alpha_topic
    self.alpha_word = alpha_word
    self._num_updates = 0
    self._report = None
    if report_filename:
        self._report = open(report_filename, 'w')
    self.num_topics = num_topics
    self._topics = [FreqDist() for x in xrange(num_topics)]
    # the sanity_check flag is for testing only.
    if initialize and sanity_check == True:
        self.deterministic_seed()
    elif initialize:
        self.initialize_topics()
Author: Mondego, Project: pyreco, Lines of code: 28, Source file: allPythonContent.py
Example 7: __init__
class Index:
    """
    The Index class stores an index for a document.
    """

    def __init__(self):
        self._freq_dist = None
        self._document = None

    def index(self, document):
        self._document = document
        if self._freq_dist == None:
            self._freq_dist = FreqDist()
            for term in self.terms():
                self._freq_dist.inc(term)

    def reset(self):
        "Reset the index"
        self._freq_dist = None

    def freq_dist(self):
        if self._freq_dist == None:
            self.index()
        return self._freq_dist

    # return the number of times a term appears in this document
    def freq(self, term):
        if not self._freq_dist:
            self.index()
        return self._freq_dist[term]

    def tf(self, term):
        if not self._freq_dist:
            self.index()
        return float(self._freq_dist[term]) / float(self._freq_dist.N())
Author: jgerrish, Project: nltk_ext, Lines of code: 34, Source file: index.py
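The index() method above relies on FreqDist.inc(), which existed in NLTK 2 but was removed in NLTK 3 (the same applies to fd.inc(word) in Example 9 below). Since the NLTK 3 FreqDist is a collections.Counter subclass, plain item assignment does the same job; a minimal sketch of that incremental update, with a stand-in for self.terms():

from nltk import FreqDist

fd = FreqDist()
for term in ['to', 'be', 'or', 'not', 'to', 'be']:   # stand-in for self.terms()
    fd[term] += 1          # NLTK 3 replacement for fd.inc(term)
print(fd['to'])            # 2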
Example 8: create_word_freq
def create_word_freq(db):
    db = getattr(db, "Posts")
    #client.command("CREATE CLASS concepted EXTENDS E")
    client.command("DELETE EDGE concepted")
    #client.command('create property frequency.freq string')
    #client.command("DELETE VERTEX frequency")
    data = db.find().batch_size(50)
    concept = client.command("SELECT name FROM concept")
    c = [c.name for c in concept]
    for d in data:
        if not 'Body' in d:
            display = ''
        else:
            display = cleanhtml(d['Body'].replace('\n', ' ').replace('\r', '').replace('\\', ''))
        tokens = nltk.word_tokenize(display)
        fdist = FreqDist(tokens)
        i = fdist.most_common()
        for k in i:
            if k[0].lower() in c:
                try:
                    client.command("CREATE EDGE concepted FROM (SELECT FROM concept WHERE name = '{0}') TO (SELECT FROM Content WHERE PostId = {1}) SET strength = {2}".format(k[0].lower(), d['_id'], k[1]))
                except:
                    continue
Author: Wehrlem, Project: flaskWebIn, Lines of code: 25, Source file: stackToOrient.py
Example 9: process
def process(f, return_tokens=True, return_freqdist=True):
    """
    Function to process deals data.
    Splits text into sentences. FreqDist is incremented from tokenization.
    Using PunktWordTokenizer, since it is a decent regexp-based tokenizer.
    Deals are also about domain names. Not intending to split it up
    :rtype : FreqDist, list() of str
    :param f: Input file with a deal per line
    """
    fd = FreqDist()
    tokens = []
    fh = open(f, 'r')
    sentences = [line.strip() for line in fh.readlines()]
    for line in sentences:
        t = []
        for word in PunktWordTokenizer().tokenize(line.lower()):
            if word not in set(stopwords.words('english')) and word not in set(string.punctuation):
                if return_tokens:
                    t.append(word)
                if return_freqdist:
                    fd.inc(word)
        tokens.append(t)
    fh.close()
    return fd, sentences, tokens
Author: ypandit, Project: exercises, Lines of code: 25, Source file: task1.py
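Two portability notes on this example: PunktWordTokenizer is no longer part of NLTK's public API in recent releases (WordPunctTokenizer or word_tokenize are the usual stand-ins), and set(stopwords.words('english')) is rebuilt inside the innermost loop on every token. A hedged sketch of the same counting loop with both points addressed, assuming NLTK 3 with the stopwords corpus downloaded and using made-up input lines:

import string

from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
skip = set(stopwords.words('english')) | set(string.punctuation)   # build the filter once

fd = FreqDist()
for line in ["Great deal on domain names today", "Half price web hosting deal"]:
    for word in tokenizer.tokenize(line.lower()):
        if word not in skip:
            fd[word] += 1      # NLTK 3 equivalent of fd.inc(word)
print(fd.most_common(3))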
Example 10: process_tweets
def process_tweets(hashtag, addl_stops=[]):
    count = 0
    good_count = 0
    words_to_plot = []
    # Iterate through all chunked files with relevant hashtag
    for fname in os.listdir(os.getcwd()):
        if fname.startswith(hashtag):
            with open(fname, 'r') as data_file:
                data = data_file.read()
                # Parse raw string since json.load() approach wasn't working
                data = data.split("\n\x00,")
                for tweet in data:
                    count += 1
                    # Tweets have a well-defined structure, so we can parse them
                    # manually (even though the JSON approach would be cleaner)
                    text = tweet[tweet.find("text\":")+7:tweet.find(",\"source\"")-1]
                    # Skip tweets that contain Unicode
                    if text.find('\u') >= 0:
                        continue
                    else:
                        good_count += 1
                        # Tokenize and count word frequency, ignoring case
                        words = word_tokenize(text)
                        clean_words = [w.lower() for w in words if not w.lower() in set(stops+addl_stops)]
                        words_to_plot = words_to_plot + clean_words
    # Create frequency histogram of 50 most common words and print summary of activity
    fdist = FreqDist(words_to_plot)
    fdist.plot(50)
    print "for "+hashtag+' we collected '+str(count)+' tweets out of which '+str(good_count)+" will be analyzed"
    return words_to_plot
Author: lanorzi, Project: MIDS-W205_A2-1, Lines of code: 33, Source file: create_histograms.py
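fdist.plot(50) above draws the histogram with matplotlib; when no display is available, FreqDist.tabulate() prints the same ranking as plain text. A minimal sketch with illustrative tokens:

from nltk import FreqDist

fdist = FreqDist(['win', 'win', 'goal', 'goal', 'goal', 'ref'])
fdist.tabulate(3)      # prints the 3 most common samples and their counts
# fdist.plot(3)        # the same data as a matplotlib chart (requires matplotlib)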
Example 11: featureset
def featureset(sample):
    comment, label = sample
    features = {}
    # tags = map(lambda statement: map(lambda (w,t):t, statement), comment)
    words = map(lambda statement: map(lambda (w,t):w, statement), comment)
    words = sum(words, [])
    # tags = sum(tags, [])
    size_ = sum([len(word) for word in words])
    features['stmt_len'] = len(words)/float(len(comment))
    features['word_len'] = size_/float(len(words))
    features['size'] = size_
    # tags_dist = FreqDist(sum(tags, []))
    # for tag in TAGS:
    #     features[tag] = tags_dist.get(tag, 0)
    dist = FreqDist([word.lower() for word in words])
    # num_stop_words = float(sum([dist.get(word, 0) for word in EN_STOPWORDS]))
    # features['prob_stop_words'] = num_stop_words/len(words)
    for word in EN_STOPWORDS:
        features[word] = dist.get(word, 0)/float(len(words))
    features['alwayson'] = 1.0
    for language in LANGUAGES:
        for i in range(1, n+1):
            word_sim, tag_sim, char_sim, w_s_sim = comment_similarity(GRAMS[language], comment, i)
            features['w_sim_%d_%s' % (i, language)] = word_sim
            features['t_sim_%d_%s' % (i, language)] = tag_sim
            features['c_sim_%d_%s' % (i, language)] = char_sim
            # features['s_sim_%d_%s' % (i, language)] = w_s_sim
    return (features, label)
Author: aboSamoor, Project: NLP, Lines of code: 28, Source file: rami_learning.py
Example 12: posAnalysis
def posAnalysis(collection):
    reviews = collection.find(timeout=False)
    __reportProgress.counter = 0
    skip = 1
    for rev in reviews:
        if skip % 200 == 0:
            print 'skip'+str(skip)
            __reportProgress()
        if rev.has_key('tags'):
            skip += 1
            if rev['tags'].has_key('NN'):
                continue
        sents = sent_tokenize(rev['text'])
        tokens = [word for sent in sents for word in word_tokenize(sent)]
        pos = tagger.tag([tok for tok in tokens if tok not in ',.-$\" '])
        tag_fd = FreqDist(tag for (word, tag) in pos)
        tags = dict()
        for (key, value) in tag_fd.items():
            k = key.replace('$', 'S')
            out = key.translate(string.maketrans("", ""), string.punctuation)
            if len(out) > 0:
                tags[k] = value
        collection.update({'_id': rev['_id']}, {"$set": {"tags": tags}})
Author: ecsark, Project: Yelp-Recruiting, Lines of code: 28, Source file: trueRating.py
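The line tag_fd = FreqDist(tag for (word, tag) in pos) above shows that FreqDist also accepts a generator, here used to count part-of-speech tags. A self-contained sketch of the same idea with NLTK's default tagger (assumes the punkt and averaged_perceptron_tagger data have been downloaded):

import nltk
from nltk import FreqDist

tokens = nltk.word_tokenize("The food was great and the service was slow")
pos = nltk.pos_tag(tokens)
tag_fd = FreqDist(tag for (word, tag) in pos)   # count how often each POS tag occurs
print(tag_fd.most_common())                     # e.g. [('DT', 2), ('VBD', 2), ('NN', 2), ...]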
Example 13: getTopNFreqWords
def getTopNFreqWords(textArr, N):
    fdist = FreqDist(textArr)
    topWordsWithFreq = fdist.most_common(N)
    topWords = []
    for word in topWordsWithFreq:
        topWords.append(word[0])
    return topWords
Author: akhilarora, Project: intelliad, Lines of code: 7, Source file: NaiveBayesClassifierBulk-USER.py
Example 14: category_by_movie
def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    documents = [(list(mr.words(f)), c) for c in mr.categories() for f in
                 mr.fileids(c)]
    random.shuffle(documents)
    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = all_words.keys()[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print document_features(mr.words('pos/cv957_8737.txt'))
    #print documents[0]
    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    print classify.accuracy(classifier, train_set)
Author: brenden17, Project: infinity, Lines of code: 30, Source file: category_nltk.py
Example 15: cleaner
def cleaner(filename):
    textfile = open(os.path.join(app.config['UPLOAD_FOLDER'], filename), 'r')
    text = []
    all_dates = []
    complete_text = []
    words_list = []
    nodes = []
    for line in textfile:
        datetime, chat = line.split('-')
        date, time = datetime.split(',')
        loc = chat.find(':')
        #if len(chat.split(':'))==3:
        #    print chat
        user, text = chat[:loc], chat[loc+2:]
        text = text.replace("\n", '')
        words = text.split(' ')
        for i in words:
            words_list.append(i)
        complete_text.append(text)
        nodes.append(user)
        all_dates.append(date)
    #print set(nodes)
    #print set(all_dates)
    fdist = FreqDist(words_list)
    f1 = fdist.most_common(100)
    create_csv('wordcloud.csv', f1)
    textfile.close()
Author: sehgalvibhor, Project: Whatsapp-ening, Lines of code: 29, Source file: flaskr.py
Example 16: bag_of_words
def bag_of_words(data, label_codebook, feature_codebook, theta):
    """"""
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            doc_tokens = set(nltk.regexp_tokenize(doc, pattern="\w+"))
            for word in doc_tokens:
                if word not in stopset:
                    word_dict.add(word)
    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist([w for w in all_words])
    word_feature = fdict.keys()[theta:]
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)

    instance_list = {}
    for label, document_list in data.items():
        instance_list[label] = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern="\w+"))
            indice = 0
            for word in tokens:
                if feature_codebook.has_label(word):
                    indice = feature_codebook.get_index(word)
                    vector[indice] = 1.
            instance_list[label].append(vector)
    return instance_list
Author: Juicechuan, Project: workspace, Lines of code: 33, Source file: naive_bayes.py
Example 17: palavrasChaves
def palavrasChaves(self):
    # NLTK helper that returns the stop words for English
    stopE = stopwords.words('english')
    # NLTK helper that returns the stop words for Portuguese
    stop = stopwords.words('portuguese')
    stopS = stopwords.words('spanish')
    palavrasChaves = []
    textoArtigo = []
    # strip punctuation from the text and split it into words
    for i in self.titulo.lower().replace(',','').replace('.','').replace('-','').replace('(','').replace(')','').split():
        # drop Portuguese stop words from the text of the article being shown
        if i not in stop:
            # drop English stop words from the text of the article being shown
            if i not in stopE:
                # ignore words shorter than 3 characters, e.g. the verb "é"
                if i not in stopS:
                    if len(i) > 2:
                        textoArtigo.append(i)
    # frequency of each word in the body of the article
    freq = FreqDist(textoArtigo)
    # keep the four most frequent words
    items = freq.items()[:4]
    # put the most frequent words into palavrasChaves
    for i in range(0, len(items)):
        palavrasChaves.append(items[i][0])
    return palavrasChaves
Author: dienerpiske, Project: QSabe, Lines of code: 34, Source file: models.py
Example 18: transmit_vocabulary
def transmit_vocabulary(t_token, t_lang):
    languages = ['danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian',
                 'norwegian', 'portuguese', 'russian', 'spanish', 'swedish', 'turkish']
    voc_stopwords = set()
    if t_lang in languages:
        voc_stopwords = set(stopwords.words(t_lang))
    i_f = codecs.open('csv/'+t_token+'.csv', 'r', 'utf-8')
    lines = i_f.readlines()
    all_tweets = []
    corpus_size = 0
    for line in lines:
        row = line.split('\t')
        words = word_tokenize(row[1])
        all_tweets.extend([w.lower() for w in words])
        corpus_size += 1
    freq_distribution = FreqDist(all_tweets)
    cats_vocabulary_elements = []
    for word, frequency in freq_distribution.most_common(1000):
        if word not in voc_stopwords:
            cats_vocabulary_elements.append('["' + word + '", ' + str(frequency) + ']')
    cats_vocabulary = '['+','.join(cats_vocabulary_elements)+']'
    print(cats_vocabulary)
    result_data = {'token': t_token, 'result': cats_vocabulary}
    json_data = json.dumps(result_data)
    results_request = urllib2.Request('http://mediamining.univ-lyon2.fr/cats/module/resultFile')
    results_request.add_header('Content-Type', 'application/json')
    results_request.data = json_data.encode('utf-8')
    urllib2.urlopen(results_request)
    print('Transmitted vocabulary for token '+t_token)
    os.remove('csv/' + t_token + '.csv')
Author: CATS-Project, Project: CATS-TextMiningServices, Lines of code: 30, Source file: run.py
Example 19: find_names
def find_names(self):
    """creates a frequency distribution of the
    most common names in the texts"""
    names_list = LIST_OF_NAMES
    name_tokens = [w for w in self.tokens if w in names_list]
    fd = FreqDist(name_tokens)
    return fd.most_common(50)
Author: wludh, Project: frenchnewspapers, Lines of code: 7, Source file: main.py
Example 20: analyzeTitles
def analyzeTitles():
    fulltitles = []
    titles = []
    with open('../top100clean.csv', 'rb') as bookfile:
        reader = csv.reader(bookfile)
        for row in reader:
            if "..." in row[0]:
                row[0] = " ".join(row[0].split(" ")[:-1])
            words = nltk.word_tokenize(row[0])
            for w in words:
                if w.isalpha() and w.lower() not in ['the', 'a']:
                    titles.append(w.lower())
            fulltitles.append(row[0])
    titleset = nltk.Text(titles)
    wordsintitle = [len(f.split(" ")) for f in fulltitles]
    wit_fd = FreqDist(wordsintitle)
    print "\nw.i.t.\tfreq"
    print "--------------------"
    for numwords, times in wit_fd.iteritems():
        print str(numwords) + "\t" + str(times)
    print "\n"
    print "\nword\t\tfreq"
    print "--------------------"
    fd = FreqDist(titleset)
    common_words = fd.most_common(25)
    for k, v in common_words:
        print str(k) + "\t\t" + str(v)
Author: nelsonam, Project: booklytics, Lines of code: 29, Source file: analyze_titles.py
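A closing note: several of the examples above use Python 2 print statements and pre-3.0 FreqDist idioms such as iteritems(), keys() slicing and inc(). Under Python 3 / NLTK 3, FreqDist subclasses collections.Counter, so the standard dictionary and Counter operations apply directly. A brief sketch of the modern equivalents of the patterns in this last example, with stand-in data:

from nltk import FreqDist

wordsintitle = [3, 2, 3, 4, 2, 3]            # stand-in for the words-per-title counts above
wit_fd = FreqDist(wordsintitle)
for numwords, times in wit_fd.items():       # Python 3 / NLTK 3 replacement for iteritems()
    print(numwords, times)
print(wit_fd.most_common(2))                 # [(3, 3), (2, 2)]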
Note: The nltk.FreqDist class examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs; the snippets were selected from open-source projects contributed by their authors. Copyright of the source code remains with the original authors; consult each project's license before distributing or using it. Do not reproduce this article without permission.