This article collects typical usage examples of the Python class nltk.probability.ConditionalFreqDist. If you have been wondering what ConditionalFreqDist is for, or how to use it, the curated class code examples below may help.
Twenty code examples of the ConditionalFreqDist class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
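Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the pattern every example relies on: a ConditionalFreqDist counts samples grouped by a condition, and each condition indexes an ordinary FreqDist.

from nltk.probability import ConditionalFreqDist

pairs = [("run", "VB"), ("dog", "NN"), ("run", "VB"), ("blue", "JJ")]
# condition on the POS tag, count the words observed under each tag
cfd = ConditionalFreqDist((tag, word) for word, tag in pairs)
print(cfd.conditions())   # ['VB', 'NN', 'JJ'] (order may vary)
print(cfd["VB"]["run"])   # 2
print(cfd["VB"].max())    # 'run'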
Example 1: cDist
def cDist(self, params):
    """return conditional freq distribution (based on part of speech) using filtered_words from loadData"""
    president = params["president"]
    speech = params["speech"]
    if self.president == "All presidents":
        pipeline = [{"$match": {"type": speech}}, {"$project": {"tags": "$filtered_speech_tags"}}]
    else:
        pipeline = [
            {"$match": {"name": president, "type": speech}},
            {"$project": {"tags": "$filtered_speech_tags"}},
        ]
    tags = []
    for i in self.col.aggregate(pipeline):
        tags.extend(i["tags"])
    cfdist = ConditionalFreqDist()  # conditioned on pos_tag
    for word, tag in tags:
        condition = tag  # specify condition to group frequencies by
        cfdist[condition][word] += 1
    VB = MLEProbDist(cfdist.get("VBP"))
    NN = MLEProbDist(cfdist.get("NN"))
    JJ = MLEProbDist(cfdist.get("JJ"))
    return VB, NN, JJ  # return verbs, nouns, adjectives
Author: cgerson | Project: presidential-haikus | Lines: 28 | Source: pres_words_spyre.py
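A hedged usage sketch for Example 1 (the tagged words below are invented; the original project reads them from MongoDB): once the tag-conditioned distribution exists, MLEProbDist turns one condition's counts into a probability distribution that can be queried or sampled.

from nltk.probability import ConditionalFreqDist, MLEProbDist

tagged = [("speak", "VBP"), ("nation", "NN"), ("great", "JJ"), ("speak", "VBP")]
cfdist = ConditionalFreqDist()
for word, tag in tagged:
    cfdist[tag][word] += 1
vb = MLEProbDist(cfdist["VBP"])
print(vb.prob("speak"))   # 1.0 -- both observed VBP tokens are "speak"
print(vb.generate())      # draws a random verb from the distribution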
Example 2: _setSelectedPOSTags
def _setSelectedPOSTags(self):
    buff = self._loadData('selective_pos.bin')
    if buff:
        self.selective_pos = buff
        return
    # First, get all (word, tag) pairs in the corpus
    sentences = brown.tagged_sents(simplify_tags=True)
    self.selected_tags = ["ADJ", "ADV", "CNJ"]
    self.selective_pos = ConditionalFreqDist()
    temp_dist = ConditionalFreqDist()
    for sentence in sentences:
        for (word, tag) in sentence:
            if tag in self.selected_tags:
                temp_dist[tag].inc(str(word).lower())
    # Now keep only the words with frequency > 4
    for category in temp_dist.conditions():
        fredist = temp_dist[category]
        for key in fredist.keys():
            if fredist[key] > 4:
                self.selective_pos[category].inc(key)
    self._saveData('selective_pos.bin', self.selective_pos)
Author: okoye | Project: sentimentanalysis | Lines: 26 | Source: opinionminer.py
Example 3: readFormatedData
def readFormatedData(formatedData):
    #unigramFd = FreqDist()
    #bigramFd = FreqDist()
    cBigramFd1 = ConditionalFreqDist()
    cBigramFd2 = ConditionalFreqDist()
    #dict1 = Set([])
    #dict2 = Set([])
    for tuple in formatedData:
        words = tuple[0].split(' ')
        count = int(tuple[1])
        #unigramFd.inc(words[0])
        #unigramFd.inc(words[1])
        #bigramFd.inc((words[0], words[1]), count)
        word2 = words[1]
        if count < 5:
            word2 = "unknown"
        cBigramFd1[words[0]].inc(word2, count)
        #if words[0] not in dict1:
        #    dict1.add(words[0])
        #if words[1] not in dict2:
        #    dict2.add(words[1])
    for w1 in cBigramFd1.conditions():
        bigram_w1 = cBigramFd1[w1]
        for w2 in bigram_w1.samples():
            cBigramFd2[w2].inc(w1, bigram_w1[w2])
    return cBigramFd1, cBigramFd2  #, dict1, dict2
Author: szha | Project: surprise-models | Lines: 26 | Source: aggregatePickMax.py
Example 4: high_information_words
def high_information_words(labeled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """
    Eliminate low-information feature words from a set of words, for EFFICIENCY.
    :param labeled_words: list of 2-tuples [(label, words)]
        label -> a classification label (pos / neg)
        words -> a list of words that occur under that label
    :param score_fn: a scoring function that measures how informative a word is
    :param min_score: the minimum score for a word to be included as a MOST INFORMATIVE WORD
    :return: a set of highly informative words
    """
    print "Counting Word Frequencies"
    word_fq = FreqDist()
    labeled_word_fq = ConditionalFreqDist()
    for label, words in labeled_words:
        for word in words:
            word_fq[word] += 1
            labeled_word_fq[label][word] += 1
    n_xx = labeled_word_fq.N()
    high_info_words = set()
    for label in labeled_word_fq.conditions():
        n_xi = labeled_word_fq[label].N()
        word_scores = collections.defaultdict(int)
        for word, n_ii in labeled_word_fq[label].iteritems():
            n_ix = word_fq[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.iteritems() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Author: Saher- | Project: SATC | Lines: 34 | Source: Sys_Params.py
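A hedged usage sketch for Example 4. The labels and words are invented, and the function body above targets Python 2 / an older NLTK (print statement, iteritems), so run it in that environment or adapt it; the call shape is the same either way.

labeled = [
    ("pos", ["good", "great", "fun", "good", "great"]),
    ("neg", ["bad", "boring", "awful", "bad", "boring"]),
]
informative = high_information_words(labeled, min_score=1)
print(informative)   # words whose chi-square score against some label is >= 1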
Example 5: get_high_information_words
def get_high_information_words(lwords, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    labels = lwords.keys()
    labelled_words = [(l, lwords[l]) for l in labels]
    word_freq_dist = FreqDist()
    label_word_freq_dist = ConditionalFreqDist()
    for label, dwords in labelled_words:
        for words in dwords:
            for word in words:
                word_freq_dist[word] += 1
                label_word_freq_dist[label][word] += 1
    n_words_total = label_word_freq_dist.N()
    high_info_words = set()
    for label in label_word_freq_dist.conditions():
        n_words_label = label_word_freq_dist[label].N()
        word_scores = defaultdict(int)
        for word, word_freq_label in label_word_freq_dist[label].items():
            word_freq = word_freq_dist[word]
            score = score_fn(word_freq_label, (word_freq, n_words_label), n_words_total)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.items() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Author: fruser | Project: review-analyzer | Lines: 27 | Source: text_utils.py
Example 6: high_information_words
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd.inc(word)
            label_word_fd[label].inc(word)
    n_xx = label_word_fd.N()
    high_info_words = set()
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        word_scores = collections.defaultdict(int)
        for word, n_ii in label_word_fd[label].iteritems():
            n_ix = word_fd[word]
            score = score_fn(n_ii, (n_ix, n_xi), n_xx)
            word_scores[word] = score
        bestwords = [word for word, score in word_scores.iteritems() if score >= min_score]
        high_info_words |= set(bestwords)
    return high_info_words
Author: RomanZacharia | Project: python_text_processing_w_nltk2_cookbook | Lines: 25 | Source: featx.py
Example 7: Ae_kappa
def Ae_kappa(self, cA, cB):
    Ae = 0.0
    nitems = float(len(self.I))
    label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
    for k in label_freqs.conditions():
        Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
    return Ae
Author: DevilDante88 | Project: MyCogs | Lines: 7 | Source: agreement.py
Example 8: _train
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Initialize this ContextTagger's ``_context_to_tag`` table
    based on the given training data.  In particular, for each
    context ``c`` in the training data, set
    ``_context_to_tag[c]`` to the most frequent tag for that
    context.  However, exclude any contexts that are already
    tagged perfectly by the backoff tagger(s).

    The old value of ``self._context_to_tag`` (if any) is discarded.

    :param tagged_corpus: A tagged corpus.  Each item should be
        a list of (word, tag) tuples.
    :param cutoff: If the most likely tag for a context occurs
        fewer than cutoff times, then exclude it from the
        context-to-tag table for the new tagger.
    """
    token_count = hit_count = 0
    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()
    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(
                        tokens, index, tags[:index])):
                useful_contexts.add(context)
    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  Only include contexts that
    # we've seen at least `cutoff` times.
    for context in useful_contexts:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        if hits > cutoff:
            self._context_to_tag[context] = best_tag
            hit_count += hits
    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print("[Trained Unigram tagger:", end=' ')
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
            size, backoff, pruning))
Author: Weiming-Hu | Project: text-based-six-degree | Lines: 59 | Source: sequential.py
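Outside the tagger class, the counting core of Example 8 reduces to the small sketch below (hypothetical contexts, not the NLTK source): condition on the context, then read back the most frequent tag and its count to decide whether it clears the cutoff.

from nltk.probability import ConditionalFreqDist

fd = ConditionalFreqDist()
events = [(("the",), "AT"), (("dog",), "NN"), (("dog",), "NN"), (("dog",), "VB")]
for context, tag in events:
    fd[context][tag] += 1
best_tag = fd[("dog",)].max()
print(best_tag, fd[("dog",)][best_tag])   # NN 2 -- kept only if 2 > cutoff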
Example 9: __init__
def __init__(self, r, name, cond_samples=None):
    self._r = r
    self._name = name
    ConditionalFreqDist.__init__(self, cond_samples)
    # initialize self._fdists for all matching keys
    for key in self._r.keys(encode_key('%s:*' % name)):
        condition = key.split(':')[1]
        self[condition]  # calls self.__getitem__(condition)
Author: RomanZacharia | Project: python_text_processing_w_nltk2_cookbook | Lines: 8 | Source: redisprob.py
Example 10: __init__
def __init__(self, r, name, cond_samples=None):
    self._r = r
    self._name = name
    ConditionalFreqDist.__init__(self, cond_samples)
    for key in self._r.keys(encode_key('%s:*' % name)):
        condition = key.split(b':')[1].decode()
        self[condition]  # calls self.__getitem__(condition)
Author: ShunyuanZ | Project: nltk3-cookbook | Lines: 8 | Source: redisprob.py
Example 11: words_by_followers
def words_by_followers(category):
    """Given a category from the brown corpus, lowercases everything,
    and returns a frequency distribution where the keys are words
    and the counts are the number of different contexts that each word can appear in."""
    bigrams = brown_bigrams(category)
    cfdist = ConditionalFreqDist((bigram[1], bigram[0]) for bigram in bigrams)
    fdist = FreqDist()
    for context in cfdist.keys():
        fdist[context] = len(cfdist[context])
    return fdist
Author: slee17 | Project: NLP | Lines: 10 | Source: languageModel.py
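Example 11 depends on a project helper, brown_bigrams, which is not shown. A plausible self-contained stand-in, written here as an assumption rather than the project's actual code, would be:

import nltk
from nltk.corpus import brown

def brown_bigrams(category):
    # lowercase the Brown words for one category and pair up neighbours
    words = [w.lower() for w in brown.words(categories=category)]
    return list(nltk.bigrams(words))

With that helper in place, words_by_followers("news").most_common(10) would list the words that follow the widest variety of predecessors in the news category.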
Example 12: _train
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    token_count = hit_count = 0
    useful_contexts = set()
    fd = ConditionalFreqDist()
    tag_prob = FreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            tag_prob.inc(tag)
            context = self.context(tokens, index, tags[:index])
            if context is None: continue
            fd[context].inc(tag)
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)
    # Build the context_to_tag table -- for each context, calculate
    # the entropy.  Only keep contexts whose entropy is lower than `cutoff`.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t, tag_prob[t] / total_tags) for t in tag_prob.keys()]
    useful_contexts_after_filter = useful_contexts.copy()
    most_high = FreqDist()
    for context in useful_contexts:
        dd = fd[context]
        # total_tags = float(sum(dd.values()))
        # tags_probs = [(t, dd[t]/total_tags) for t in dd.keys()]
        h = self.H(dd.keys(), tags_probs)
        if h > cutoff:
            useful_contexts_after_filter.remove(context)
            continue
        most_high[context] = h
    print most_high.keys()
    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits
    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print "[Trained Unigram tagger:",
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)
Author: atiassa | Project: recommend-2011 | Lines: 49 | Source: q2.py
Example 13: __init__
def __init__(self, unk=None, Trained=False, N=1000, C=False):
    '''
    Construct a TnT statistical tagger. Tagger must be trained
    before being used to tag input.

    :param unk: instance of a POS tagger, conforms to TaggerI
    :type unk: (TaggerI)
    :param Trained: Indication that the POS tagger is trained or not
    :type Trained: boolean
    :param N: Beam search degree (see above)
    :type N: (int)
    :param C: Capitalization flag
    :type C: boolean

    Initializer, creates frequency distributions to be used
    for tagging

    _lx values represent the portion of the tri/bi/uni taggers
    to be used to calculate the probability

    N value is the number of possible solutions to maintain
    while tagging. A good value for this is 1000

    C is a boolean value which specifies to use or
    not use the Capitalization of the word as additional
    information for tagging.
    NOTE: using capitalization may not increase the accuracy
    of the tagger
    '''
    self._uni = FreqDist()
    self._bi = ConditionalFreqDist()
    self._tri = ConditionalFreqDist()
    self._wd = ConditionalFreqDist()
    self._eos = ConditionalFreqDist()
    self._l1 = 0.0
    self._l2 = 0.0
    self._l3 = 0.0
    self._N = N
    self._C = C
    self._T = Trained
    self._unk = unk

    # statistical tools (ignore or delete me)
    self.unknown = 0
    self.known = 0
Author: Arttii | Project: TextBlob | Lines: 47 | Source: tnt.py
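Example 13 is the initializer of NLTK's TnT tagger, which wires several ConditionalFreqDist objects together for word, bigram, and trigram contexts. A short, hedged usage sketch of that tagger follows; the corpus choice and slice size are arbitrary, and the treebank corpus must be downloaded first.

from nltk.corpus import treebank
from nltk.tag import tnt

train_sents = treebank.tagged_sents()[:1000]
tagger = tnt.TnT()
tagger.train(train_sents)   # fills the _uni / _bi / _tri / _wd distributions
# unknown words are tagged 'Unk' unless an unk tagger is supplied
print(tagger.tag(["The", "cat", "sat", "down"]))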
Example 14: validate_pcfg_generate
def validate_pcfg_generate(grammar):
    pd = makeLhrProbDict(grammar)
    productions = []
    cfd = ConditionalFreqDist()
    for i in np.arange(1000):
        tree = pcfg_generate(grammar)
        productions += tree.productions()
    for p in productions:
        cfd[p.lhs()].inc(p.rhs())
    for c in cfd.conditions():
        p = MLEProbDist(cfd[c])
        q = pd[c]
        div = KL_Divergence(p, q)
        print "KL_Divergence for %s = %f" % (c, div)
Author: haozhuoran1991 | Project: recommend-2011 | Lines: 17 | Source: q2_1.py
Example 15: __init__
def __init__(self, load_from_disk=True):
    self._corpus = reuters.words()
    self._unigram_fd = FreqDist()
    self._bigram_cfd = ConditionalFreqDist()
    self._trigram_cfd = ConditionalFreqDist()
    self._quadgram_cfd = ConditionalFreqDist()
    self._unigram_pd = None
    self._bigram_cpd = None
    self._trigram_cpd = None
    self._quadgram_cpd = None
    if load_from_disk:
        self._load_models()
    else:
        self._train()
Author: drewatk | Project: textPredictor | Lines: 17 | Source: predictor.py
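The bigram path of Example 15 boils down to the sketch below, which is a simplification rather than the project's _train method: a ConditionalFreqDist over adjacent word pairs from the Reuters corpus, wrapped in a ConditionalProbDist for next-word prediction (requires the reuters corpus to be downloaded).

from nltk import bigrams
from nltk.corpus import reuters
from nltk.probability import ConditionalFreqDist, ConditionalProbDist, MLEProbDist

words = [w.lower() for w in reuters.words()[:50000]]   # small slice for speed
bigram_cfd = ConditionalFreqDist(bigrams(words))
bigram_cpd = ConditionalProbDist(bigram_cfd, MLEProbDist)
prev = "the"                       # any word observed in the slice works
nxt = bigram_cfd[prev].max()       # most frequent follower of prev
print(nxt, bigram_cpd[prev].prob(nxt))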
Example 16: _train
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    """
    token_count = hit_count = 0
    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()
    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()
    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None: continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)
    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  Only include contexts that
    # we've seen at least `cutoff` times.
    for context in useful_contexts:
        #best_tag = fd[context].max()
        for (tag, hits) in fd[context].items():
            if hits > cutoff:
                self._contexts_to_tags[context] = self._contexts_to_tags.get(context, {})
                self._contexts_to_tags[context][tag] = hits
                hit_count += hits
    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print "[Trained Unigram tagger:",
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
            size, backoff, pruning)
Author: 0623forbidden | Project: nltk4russian | Lines: 43 | Source: tagger.py
Example 17: sum_category_word_scores
def sum_category_word_scores(categorized_words, score_fn):
    word_fd = FreqDist()
    category_word_fd = ConditionalFreqDist()
    for category, words in categorized_words:
        for word in words:
            word_fd.inc(word)
            category_word_fd[category].inc(word)
    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()
    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()
        for word, n_ii in iteritems(category_word_fd[category]):
            n_ix = word_fd[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)
    return scores
Author: Herka | Project: nltk-trainer | Lines: 20 | Source: scoring.py
Example 18: __init__
def __init__(self, n, train, pad_left=True, pad_right=False, estimator=None, *estimator_args, **estimator_kwargs):
    super(MyNgramModel, self).__init__(n, train, pad_left, pad_right, estimator, *estimator_args, **estimator_kwargs)
    assert(isinstance(pad_left, bool))
    assert(isinstance(pad_right, bool))
    self._n = n
    self._lpad = ('',) * (n - 1) if pad_left else ()
    self._rpad = ('',) * (n - 1) if pad_right else ()
    if estimator is None:
        estimator = _estimator
    self._cfd = ConditionalFreqDist()
    self._ngrams = set()
    # If given a list of strings instead of a list of lists, create enclosing list
    if (train is not None) and isinstance(train[0], basestring):
        train = [train]
    for sent in train:
        for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            self._cfd[context].inc(token)
    if not estimator_args and not estimator_kwargs:
        self._model = ConditionalProbDist(self._cfd, estimator, len(self._cfd))
    else:
        self._model = ConditionalProbDist(self._cfd, estimator, *estimator_args, **estimator_kwargs)
    # recursively construct the lower-order models
    self._backoff = None
    if n > 1:
        self._backoff = MyNgramModel(n - 1, train, pad_left, pad_right, estimator, *estimator_args, **estimator_kwargs)
    if self._backoff is not None:
        self._backoff_alphas = dict()
        # For each condition (or context)
        for ctxt in self._cfd.conditions():
            pd = self._model[ctxt]  # prob dist for this context
            backoff_ctxt = ctxt[1:]
            backoff_total_pr = 0
            total_observed_pr = 0
            for word in self._cfd[ctxt].keys():  # this is the subset of words that we OBSERVED
                backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
                total_observed_pr += pd.prob(word)
            assert total_observed_pr <= 1 and total_observed_pr > 0
            assert backoff_total_pr <= 1 and backoff_total_pr > 0
            alpha_ctxt = (1.0 - total_observed_pr) / (1.0 - backoff_total_pr)
            self._backoff_alphas[ctxt] = alpha_ctxt
Author: yezhang1989 | Project: Time-Series-Analysis | Lines: 54 | Source: MyNgram.py
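The loop at the end of Example 18 computes Katz-style backoff weights: for each context, alpha is the probability mass the higher-order model leaves unexplained, divided by the backoff model's mass outside the observed words. A hedged restatement of just that step:

def backoff_alpha(observed_pr_sum, backoff_pr_sum):
    # observed_pr_sum: sum of higher-order P(w | ctxt) over words seen after ctxt
    # backoff_pr_sum:  sum of backoff P(w | ctxt[1:]) over those same words
    return (1.0 - observed_pr_sum) / (1.0 - backoff_pr_sum)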
Example 19: significantWords
def significantWords(untagged_docs, min_chisq=5, ratio=0.75):
    """
    Use the chi-square test on the bigram contingency table to measure
    the association of a token with its sentiment.

    Parameters
    ----------
    untagged_docs: list of tuples (words, tag)
    min_chisq: lower bound of significance
    ratio: pos/neg ratio, used to determine the sentiment of a word

    Returns
    -------
    significant_words: a 3-key dict of word sets
    """
    significant_words = collections.defaultdict(set)
    freq_dist = FreqDist()
    label_freq_dist = ConditionalFreqDist()
    stopping_words = set(nltk.corpus.stopwords.words('english'))
    for tokens, label in untagged_docs:
        for token in tokens:
            if token.isalpha() and not (token in stopping_words):
                freq_dist.inc(token)
                label_freq_dist[label].inc(token)
    n_xx = label_freq_dist.N()
    #pdb.set_trace()
    for label in label_freq_dist.conditions():
        for word, n_ii in label_freq_dist[label].iteritems():
            n_xi = label_freq_dist[label].N()
            n_ix = freq_dist[word]
            n_oi = n_xi - n_ii
            n_io = n_ix - n_ii
            n_oo = n_xx - n_oi - n_io - n_ii
            chisq = float(n_xx * (n_ii * n_oo - n_io * n_oi) ** 2) \
                    / ((n_ii + n_io) * (n_ii + n_oi) * (n_oo + n_io) * (n_oo + n_oi))
            if chisq > min_chisq and n_ii > 10:
                significant_words['total'] |= set([word])
                if float(n_ii) / n_ix > ratio and (n_ix - n_ii) > 1:
                    significant_words[label] |= set([word])
    return significant_words
Author: Applied-data-science-HW8 | Project: Homework_08 | Lines: 41 | Source: tagger.py
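The inline formula in Example 19 is the standard 2x2 contingency chi-square. As a hedged cross-check, NLTK's BigramAssocMeasures.chi_sq should give the same value for the same marginals; the counts below are invented.

from nltk.metrics import BigramAssocMeasures

n_ii, n_ix, n_xi, n_xx = 30, 50, 200, 1000
n_oi = n_xi - n_ii
n_io = n_ix - n_ii
n_oo = n_xx - n_oi - n_io - n_ii
inline = float(n_xx * (n_ii * n_oo - n_io * n_oi) ** 2) / (
    (n_ii + n_io) * (n_ii + n_oi) * (n_oo + n_io) * (n_oo + n_oi))
print(inline, BigramAssocMeasures.chi_sq(n_ii, (n_ix, n_xi), n_xx))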
Example 20: sum_category_word_scores
def sum_category_word_scores(categorized_words, score_fn):
    # word frequencies across all categories
    word_fd = FreqDist()
    # conditional frequencies, keyed by category
    category_word_fd = ConditionalFreqDist()
    # count each word under its category
    for category, words in categorized_words:
        for word in words:
            word_fd.inc(word)
            category_word_fd[category].inc(word)
    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()
    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()
        for word, n_ii in category_word_fd[category].iteritems():
            n_ix = word_fd[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)
    # return the accumulated scores
    return scores
Author: CloudFlix | Project: Project_CloudFlix | Lines: 22 | Source: model_trainer.py
Note: The nltk.probability.ConditionalFreqDist class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation hosting platforms. The code snippets are selected from open-source projects contributed by their respective developers, and copyright remains with the original authors; please consult the corresponding project's license before redistributing or reusing them. Reproduction without permission is prohibited.