This article collects typical usage examples of the Python function nltk.util.tokenwrap. If you are wondering what tokenwrap does, how to call it, or what it looks like in real code, the hand-picked examples below should help.
The article presents 18 code examples of the tokenwrap function, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python examples.
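Before the examples, here is a minimal sketch of what tokenwrap itself does, assuming only a standard NLTK installation: it joins a sequence of string tokens with a separator and wraps the result to a fixed line width (70 characters by default in recent NLTK releases), which is why the examples below use it to pretty-print word lists and collocations. The token values are purely illustrative.

from nltk.util import tokenwrap

# A handful of illustrative tokens, similar to the word lists the examples below print.
words = ["monied", "nervous", "dangerous", "white", "pious", "queer", "good"]

# tokenwrap returns a single string: the tokens joined by spaces and
# wrapped to a fixed line width, ready to be printed.
print(tokenwrap(words))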
Example 1: demo_similar

def demo_similar(self, word, num=20):
    """
    Distributional similarity: find other words which appear in the
    same contexts as the specified word; list most similar words first.

    @param word: The word used to seed the similarity search
    @type word: C{str}
    @param num: The number of words to generate (default=20)
    @type num: C{int}
    @seealso: L{ContextIndex.similar_words()}
    """
    if '_word_context_index' not in self.__dict__:
        print 'Building word-context index...'
        self._word_context_index = nltk.text.ContextIndex(self.tokens,
                                                          filter=lambda x: x.isalpha(),
                                                          key=lambda s: s.lower())
    # words = self._word_context_index.similar_words(word, num)
    while 1:
        word = raw_input('Enter a Chinese word such as "開心"(type 0 to exit):')
        print "word='" + word + "'"
        if word == '0': break
        word = word.decode('utf-8')
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = fd.keys()[:num]
            print tokenwrap(words)
        else:
            print "No matches"

Developer: dreampocketit, Project: bocard, Lines of code: 33, Source file: NLTK_tools.py
Example 2: demo_common_context

def demo_common_context(self, num=20):
    """
    Find contexts where the specified words appear; list
    most frequent common contexts first.
    @seealso: L{ContextIndex.common_contexts()}
    """
    if '_word_context_index' not in self.__dict__:
        print 'Building word-context index...'
        self._word_context_index = nltk.text.ContextIndex(self.tokens,
                                                          key=lambda s: s.lower())
    while 1:
        inp = raw_input('Enter two Chinese words such as "我 你"(type 0 to exit):')
        print "inp='" + inp + "'"
        if inp == '0': break
        inp = inp.decode('utf-8')
        words = inp.split(u' ')
        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print "No common contexts were found"
            else:
                ranked_contexts = fd.keys()[:num]
                print tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts)
        except ValueError, e:
            print e

Developer: dreampocketit, Project: bocard, Lines of code: 25, Source file: NLTK_tools.py
Example 3: demo_collocations

def demo_collocations(self, num=40, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.
    @seealso: L{find_collocations}
    @param num: The maximum number of collocations to print.
    @type num: C{int}
    @param window_size: The number of tokens spanned by a collocation (default=2)
    @type window_size: C{int}
    """
    if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        print "Building collocations list"
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        from nltk.collocations import BigramCollocationFinder
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        from nltk.metrics import f_measure, BigramAssocMeasures
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + u' ' + w2 for w1, w2 in self._collocations]
    print "List {0} collocations".format(num)
    print tokenwrap(colloc_strings, separator=u'; ')

Developer: dreampocketit, Project: bocard, Lines of code: 26, Source file: NLTK_tools.py
Example 4: similar

def similar(self, word, num=20):
    """
    Distributional similarity: find other words which appear in the
    same contexts as the specified word; list most similar words first.

    :param word: The word used to seed the similarity search
    :type word: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.similar_words()
    """
    if '_word_context_index' not in self.__dict__:
        # print('Building word-context index...')
        self._word_context_index = ContextIndex(
            self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower()
        )

    # words = self._word_context_index.similar_words(word, num)

    word = word.lower()
    wci = self._word_context_index._word_to_contexts
    if word in wci.conditions():
        contexts = set(wci[word])
        fd = Counter(
            w
            for w in wci.conditions()
            for c in wci[w]
            if c in contexts and not w == word
        )
        words = [w for w, _ in fd.most_common(num)]
        print(tokenwrap(words))
    else:
        print("No matches")

Developer: prz3m, Project: kind2anki, Lines of code: 33, Source file: text.py
Example 5: collocations

def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not (
        '_collocations' in self.__dict__
        and self._num == num
        and self._window_size == window_size
    ):
        self._num = num
        self._window_size = window_size

        # print("Building collocations list")
        from nltk.corpus import stopwords

        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))

Developer: prz3m, Project: kind2anki, Lines of code: 29, Source file: text.py
Example 6: findall

def findall(self, regexp):
    """
    Find instances of the regular expression in the text.
    The text is a list of tokens, and a regexp pattern to match
    a single token must be surrounded by angle brackets.  E.g.

    >>> print('hack'); from nltk.book import text1, text5, text9
    hack...
    >>> text5.findall("<.*><.*><bro>")
    you rule bro; telling you bro; u twizted bro
    >>> text1.findall("<a>(<.*>)<man>")
    monied; nervous; dangerous; white; white; white; pious; queer; good;
    mature; white; Cape; great; wise; wise; butterless; white; fiendish;
    pale; furious; better; certain; complete; dismasted; younger; brave;
    brave; brave; brave
    >>> text9.findall("<th.*>{3,}")
    thread through those; the thought that; that the thing; the thing
    that; that that thing; through these than through; them that the;
    through the thick; them that they; thought that the

    :param regexp: A regular expression
    :type regexp: str
    """
    if "_token_searcher" not in self.__dict__:
        self._token_searcher = TokenSearcher(self)

    hits = self._token_searcher.findall(regexp)
    hits = [' '.join(h) for h in hits]
    print(tokenwrap(hits, "; "))

Developer: prz3m, Project: kind2anki, Lines of code: 30, Source file: text.py
Example 7: sandwich

def sandwich(cls, word):
    """
    """
    ind = cls.corpora_health.index(max(cls.corpora_health))
    results = cls.corpora[ind].sandwich(word)
    # results = [corpus.sandwich(word) for corpus in cls.corpora]
    return tokenwrap(results)

Developer: jktong, Project: content-consumption, Lines of code: 7, Source file: managers.py
Example 8: common_contexts

def common_contexts(self, words, num=20):
    """
    Find contexts where the specified words appear; list
    most frequent common contexts first.

    :param word: The word used to seed the similarity search
    :type word: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.common_contexts()
    """
    if '_word_context_index' not in self.__dict__:
        # print('Building word-context index...')
        self._word_context_index = ContextIndex(
            self.tokens, key=lambda s: s.lower()
        )

    try:
        fd = self._word_context_index.common_contexts(words, True)
        if not fd:
            print("No common contexts were found")
        else:
            ranked_contexts = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))
    except ValueError as e:
        print(e)

Developer: prz3m, Project: kind2anki, Lines of code: 27, Source file: text.py
Example 9: gen

def gen(context='', hashtag='', tries=30):
    tokens = nltk.word_tokenize(corpus)
    text = nltk.Text(tokens)
    text.generate(0)  # generate model
    n = 10
    r = tokenwrap(text._trigram_model.generate(n, context))
    return r[:140 - len(hashtag)] + ' ' + hashtag

Developer: mdamien, Project: twitter-poetry, Lines of code: 8, Source file: gen.py
Example 10: preprocessing

def preprocessing(comment):
    """
    Function to clean the comment. Lower all words and remove stop words.
    """
    words = nltk.word_tokenize(comment)
    clean_words = [word.lower() for word in words if word.lower() not in stopwords.words('danish')]
    cleaned_comment = tokenwrap(clean_words)
    return cleaned_comment

Developer: dtu-02819-projects-fall2014, Project: InfoMine, Lines of code: 11, Source file: gender_classifier.py
Example 11: collocations

def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    collocation_strings = [w1 + ' ' + w2 for w1, w2 in self.collocation_list(num, window_size)]
    print(tokenwrap(collocation_strings, separator="; "))

Developer: rmalouf, Project: nltk, Lines of code: 12, Source file: text.py
Example 12: demo_findall

def demo_findall(text):
    while 1:
        inp = raw_input('Enter two Chinese words such as "我:2 手:4"(type 0 to exit):')
        print "inp='" + inp + "'"
        if inp == '0': break
        inp = inp.decode('big5')
        reg = "<1> <2> <3> <4> <5>"
        if len(inp) == 0:
            print 'no input words'
        else:
            for wp in inp.split(' '):
                (w, p) = wp.split(':')
                # reg = re.sub(p, w, reg)
                reg = re.sub(p, ''.join(['.*', w, '.*']), reg)
            reg = re.sub('\d', '.*', reg)
            print "reg=", reg
            # text.findall(reg)
            if "_token_searcher" not in text.__dict__:
                text._token_searcher = nltk.text.TokenSearcher(text)
            hits = text._token_searcher.findall(reg)
            hits = [' '.join(h) for h in hits]
            print tokenwrap(hits, u"; ")

Developer: dreampocketit, Project: bocard, Lines of code: 22, Source file: NLTK_tools.py
Example 13: generate

def generate(self, length=100, context=()):
    """
    Return random text, generated using a trigram language model.

    :param length: The length of text to generate (default=100)
    :type length: int
    :seealso: NgramModel
    """
    if '_trigram_model' not in self.__dict__:
        print "Building ngram index..."
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self._trigram_model = NgramModel(3, self, estimator=estimator)
    text = self._trigram_model.generate(length, context=context)
    return tokenwrap(text)

Developer: cssndrx, Project: writers-block, Lines of code: 14, Source file: nltk_custom.py
Example 14: synonyms

def synonyms(word):
    ## todo: this should move because we want to cache the results so we can calculate health!!
    results = []
    for synset in wn.synsets(word):
        results.extend(synset.lemma_names)
    result_set = set(results)
    if word in result_set:
        result_set.remove(word)

    ### todo: stopped here... should filter these down to some reasonable thing
    ############ todo: check if the above needs to be cached somewhere (maybe it is cached by wn.synsets?)
    results = list(result_set)
    results = results[:MAX_SYNONYMS_TO_RETURN]
    return tokenwrap(results)

Developer: jktong, Project: content-consumption, Lines of code: 16, Source file: managers.py
Example 15: similar

def similar(self, word, num=20):
    """
    Returns similar words as a single string.
    """
    if '_word_context_index' not in self.__dict__:
        print 'Building word-context index...'
        self._word_context_index = nltk.ContextIndex(self.tokens,
                                                     filter=lambda x: x.isalpha(),
                                                     key=lambda s: s.lower())
    # words = self._word_context_index.similar_words(word, num)
    word = word.lower()
    wci = self._word_context_index._word_to_contexts
    if word in wci.conditions():
        contexts = set(wci[word])
        fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                      if c in contexts and not w == word)
        words = fd.keys()[:num]
        return tokenwrap(words)
    else:
        print "No matches"

Developer: cssndrx, Project: writers-block, Lines of code: 22, Source file: nltk_custom.py
Example 16: len

# NLTK processing
words = [w
         for t in status_texts
         for w in t.split()]

nltk_text = nltk.Text(words)
nltk_text.collocations()

ignored_words = stopwords.words('english')
finder = BigramCollocationFinder.from_words(words, 2)
finder.apply_freq_filter(2)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
bigram_measures = nltk.collocations.BigramAssocMeasures()
collocations = finder.nbest(bigram_measures.likelihood_ratio, 20)
colloc_strings = [w1 + ' ' + w2 for w1, w2 in collocations]
# finder = BigramCollocationFinder(word_fd, bigram_fd)
print tokenwrap(colloc_strings, separator="; ")

# create unstylized HTML
summarizedLinks = Counter(urls)
html_file = open('{0}_{1}_statuses.html'.format(data_file, file_time), 'w')
html_file.write('<!DOCTYPE html><html><head></head><body><h1>Analysis of past tweets: "{0}"</h1><h2>{1}</h2>'.format(q, now_time.strftime(fmt)))
html_file.write('<br /><br /><h2>Collocations of commonly occuring pairs of words</h2>')
html_file.write('<ul>')
for collocation in colloc_strings:
    html_file.write('<li>{0}</li>'.format(collocation))
html_file.write('</ul>')
html_file.write('<h2>Most common referenced URLs, unshortened and sorted</h2>')

Developer: nealgriffin, Project: command_line_twitter_search, Lines of code: 31, Source file: twitter_search_anon.py
Example 17: wrap

def wrap(iterable):
    return tokenwrap(iterable)

Developer: cssndrx, Project: writers-block, Lines of code: 2, Source file: utils.py
Example 18: sandwich

def sandwich(cls, word):
    """
    """
    results = [corpus.sandwich(word) for corpus in cls.corpora]
    return tokenwrap(results)

Developer: cssndrx, Project: content-consumption, Lines of code: 5, Source file: managers.py
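Several of the examples above pass a custom separator such as "; " to tokenwrap. As a closing illustration, the sketch below contrasts the default space separator with an explicit one; it assumes the nltk.util signature tokenwrap(tokens, separator=" ", width=70), so the width keyword and its default should be verified against your installed NLTK version. The collocation strings are illustrative values only.

from nltk.util import tokenwrap

# Illustrative collocation strings, like those built in Examples 3, 5, and 11.
collocations = ["sperm whale", "white whale", "old man", "ancient mariner"]

# Semicolon-separated, as in the collocation examples above.
print(tokenwrap(collocations, separator="; "))

# Assumed `width` keyword: a narrower wrap width forces more line breaks
# in the returned string.
print(tokenwrap(collocations, separator="; ", width=25))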
Note: The nltk.util.tokenwrap examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their original authors; copyright in the code remains with those authors, and distribution or use should follow the license of the corresponding project. Do not reproduce this article without permission.