This article collects typical usage examples of the Python class nltk.tokenize.TreebankWordTokenizer. If you are wondering what TreebankWordTokenizer is for or how to use it, the curated class examples below may help.
The following 20 code examples of the TreebankWordTokenizer class are shown, sorted by popularity by default.
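Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of the API that every example relies on: create a TreebankWordTokenizer instance and call its tokenize() method on a string.

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
# Penn Treebank conventions: contractions and trailing punctuation become separate tokens.
print(tokenizer.tokenize("They'll save and invest more."))
# ['They', "'ll", 'save', 'and', 'invest', 'more', '.']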
Example 1: find_ml

def find_ml(self, td):
    f_tokenizer = TreebankWordTokenizer()
    query_words = f_tokenizer.tokenize(td)
    genres = self.sentiment_analysis(query_words)
    weighted_genres = []
    genre_weights = {}
    for x in genres:
        if x[1] is not None:
            weighted_genres.append(x[0])
            genre_weights[x[0]] = x[1]
    d_score_updates = {}
    for movie in self.movies:
        g = self.genre_dict[movie][0]
        total_genre_score = 0
        if u'Comedy' in g and 'comedy' in weighted_genres:
            total_genre_score += genre_weights['comedy']
        if u'Action' in g and 'action' in weighted_genres:
            total_genre_score += genre_weights['action']
        if u'Crime' in g and 'crime' in weighted_genres:
            total_genre_score += genre_weights['crime']
        if u'Drama' in g and 'drama' in weighted_genres:
            total_genre_score += genre_weights['drama']
        d_score_updates[self.movies.index(movie)] = total_genre_score * .1
    return d_score_updates

Author: nporwal | Project: cs4300sp2016-moviequotes | Lines: 26 | Source: find.py
Example 2: transformTweetData

def transformTweetData(tweet):
    content = unicode(tweet.sentence.lower(), errors='ignore')
    words = content.strip().split()
    tokenizer = TreebankWordTokenizer()
    extra_features = []
    content = " ".join(words + extra_features)
    tokens = tokenizer.tokenize(content)
    tokens = [t for t in tokens if t not in stopwords]
    return tokens

Author: prashant-r | Project: StanceClassification | Lines: 9 | Source: preprocess.py
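Note that `stopwords` in Example 2 is defined elsewhere in the project. A minimal sketch of one plausible setup is shown below (it assumes the author loads NLTK's English stopword corpus, which the snippet itself does not confirm):

from nltk.corpus import stopwords as nltk_stopwords

# Assumed setup: a set of English stopwords for fast membership tests.
stopwords = set(nltk_stopwords.words('english'))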
Example 3: tokenize_en

def tokenize_en(text):
    """
    Return a list of lists of the tokens in text, separated by sentences.
    """
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer = TreebankWordTokenizer()
    sentences = [tokenizer.tokenize(sentence)
                 for sentence in sent_tokenizer.tokenize(text)]
    return sentences

Author: DSam1991 | Project: nlpnet | Lines: 9 | Source: utils.py
Example 4: getNoun

def getNoun(self, parser, sentence):
    #mysent = sentence.encode('ascii', 'ignore')
    #sent = mysent.decode()
    penn = TreebankWordTokenizer()
    tags = parser.tag(penn.tokenize(sentence))
    nouns = []
    for t in tags:
        if t[1].startswith('NN'):
            nouns.append(t[0])
    return ' '.join(nouns)

Author: sidiksoleman | Project: CiteCheck | Lines: 11 | Source: nounextraction.py
Example 5: genLexicon

def genLexicon(data):
    tok = TreebankWordTokenizer()
    texts = []
    for doc in data:
        for sent in doc:
            texts.append(tok.tokenize(sent[1].lower()))
    dictionary = corpora.Dictionary(texts)
    pickle.dump(dictionary, open("lex/toy.lex", "w"))

Author: LEONOB2014 | Project: NLP-final-project | Lines: 12 | Source: genFeature.py
Example 6: crear_dicc_doc_term

def crear_dicc_doc_term(path):
    result = []
    result_aux = []
    file = open(path)
    for f in file:
        result.append(f)
    tokenizer = TreebankWordTokenizer()
    for s in result:
        tokenizer = RegexpTokenizer("[\w']+")
        temp = tokenizer.tokenize(s)
        words = temp
        result_aux += eiminar_stopwords(words)
    return result_aux

Author: YanetFrancisco | Project: NaiveBayesClassifier | Lines: 13 | Source: pre_procesing_text.py
Example 7: section_02_02

def section_02_02( datDIR ):

    print("\n### ~~~~~ Section 02.02 ~~~~~~~~");

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    textfile = os.path.join( datDIR , "the-great-gatsby.txt" )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    with open(file = textfile, mode = 'r') as inF:
        sentences = []
        for i, tempLine in enumerate(inF):
            if i > 100:
                break
            tempLine = tempLine.strip()
            sentences.append(tempLine)
            print( "%5d: %s" % (i,tempLine) )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    mySentence = sentences[20] + " " + sentences[21]
    print("\nmySentence:")
    print( mySentence )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    #tokens = mySentence.split("([-\s.,;!?])+")
    tokens = re.split("([-\s.,;!?])+",mySentence)
    temp = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None,tokens))
    print("\ntemp")
    print( temp )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myPattern = re.compile("([-\s.,;!?])+")
    tokens = myPattern.split(mySentence)
    print("\ntokens[-10:]")
    print( tokens[-10:] )

    temp = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None,tokens))
    print("\ntemp")
    print( temp )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myRegexpTokenizer = RegexpTokenizer("\w+|$[0-9.]+|\S+")
    print("\nmyRegexpTokenizer.tokenize(mySentence):")
    print( myRegexpTokenizer.tokenize(mySentence) )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myTreebankWordTokenizer = TreebankWordTokenizer()
    print("\nmyTreebankWordTokenizer.tokenize(mySentence):")
    print( myTreebankWordTokenizer.tokenize(mySentence) )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )

Author: paradisepilot | Project: statistics | Lines: 50 | Source: Section_02_02.py
Example 8: word_tokenizePT

def word_tokenizePT(self, text, tokenizer):
    """ Tokenize a Portuguese sentence into words.
    @input params: text      - a sentence or phrase
                   tokenizer - "TB" for TreebankWordTokenizer
                               "WP" for WordPunctTokenizer
    @returns a list of words, or an error message """
    if tokenizer == "TB":
        tokenizerTB = TreebankWordTokenizer()
        return tokenizerTB.tokenize(text)
    elif tokenizer == "WP":
        tokenizerWP = WordPunctTokenizer()
        return tokenizerWP.tokenize(text)
    else:
        return "tokenizer error: not found"

Author: fabiodomingos | Project: EADW | Lines: 14 | Source: NLP_PT.py
Example 9: __init__

class TreebankWordTokenizerWrapper:
    """ Wraps TreebankWordTokenizer and splits off a trailing ',' or "'s"
    that the tokenizer occasionally leaves attached to a token
    (see PAT_NLTK_BUG below). """

    PAT_NLTK_BUG = re.compile(r"^(?:(.+)(,|'s))$")

    def __init__(self):
        self.word_tokenizer = TreebankWordTokenizer()

    def tokenize(self, s):
        temp = self.word_tokenizer.tokenize(s)
        if temp:
            it = []
            for t0 in temp:
                t = [t0]
                while True:
                    m = self.PAT_NLTK_BUG.search(t[0])
                    if m:
                        t.insert(0, m.group(1))
                        t[1] = m.group(2)
                    else:
                        break
                it += t
                #sys.stderr.write('DEBUG: t=%s => %s\n' % (t0, t))
        else:
            it = temp
        return it

Author: acli | Project: Coursera-subtitles | Lines: 26 | Source: reformat-extracted-text.py
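To illustrate what the wrapper in Example 9 is compensating for, the short standalone sketch below applies its PAT_NLTK_BUG pattern to a few hypothetical tokens; any token that still ends in a comma or "'s" after Treebank tokenization would be split into two tokens by the wrapper.

import re

PAT_NLTK_BUG = re.compile(r"^(?:(.+)(,|'s))$")

# Hypothetical tokens, chosen only to exercise the pattern.
for token in ["Alice's", "1,2,", "hello"]:
    m = PAT_NLTK_BUG.search(token)
    print(token, "->", m.groups() if m else "left unchanged")
# Alice's -> ('Alice', "'s")     (wrapper would emit two tokens)
# 1,2,    -> ('1,2', ',')        (wrapper would emit two tokens)
# hello   -> left unchanged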
Example 10: __init__

def __init__(self):
    self.tokenizer = TreebankWordTokenizer()
    self.word_pattern = re.compile(r"^([\w.]*)(\.)(\w*)$")
    self.proper_noun = re.compile(r"([A-Z]\.){2,}$")
    f = open(get_wpath("transition_words"), "r", encoding="utf8")
    transition_word = f.readline()
    self.words = r"([.,!?;:])\ *" + transition_word
    f.close()

    training_sents = nltk.corpus.treebank_raw.sents()
    tokens = []
    boundaries = set()
    offset = 0
    for sent in training_sents:
        tokens.extend(sent)
        offset += len(sent)
        boundaries.add(offset - 1)

    # Create training features
    featuresets = [(self.punct_features(tokens, i), (i in boundaries))
                   for i in range(1, len(tokens) - 1)
                   if tokens[i] in '.?!']
    train_set = featuresets
    self.classifier = nltk.NaiveBayesClassifier.train(train_set)

Author: AntNLP | Project: opie | Lines: 27 | Source: sentence_tokenizer.py
Example 11: CRCleaner

class CRCleaner(Cleaner):
    def __init__(self, input_dir, output_dir):
        super(CRCleaner, self).__init__(input_dir, output_dir, u"-\n'", punctuation + digits)
        self.t = TreebankWordTokenizer()

    def cleaned_text(self, text):
        if len(text) == 0:
            return u""
        sans_xml = self.xml_to_txt(text)
        arr = self.t.tokenize(sans_xml)
        return self.reconstruct_arr(arr)

    def xml_to_txt(self, xml):
        arr = []
        dom = parseString(xml)
        for node in (dom.firstChild.getElementsByTagName('speaking')
                     + dom.firstChild.getElementsByTagName('speaking-unknown-id')):
            paragraphs = node.getElementsByTagName('paragraph')
            if len(paragraphs) > 0:
                for node2 in paragraphs:
                    if node2.hasChildNodes():
                        child = node2.firstChild
                        if child.nodeType == child.TEXT_NODE:
                            arr += [child.data.replace(' ', ' ')]
        return ' '.join(arr)

    def new_filename(self, old_filename):
        return old_filename.replace('.xml', '.txt')

Author: jergason | Project: topicalguide | Lines: 27 | Source: clean_cr.py
Example 12: _compute_unigram_frequency

def _compute_unigram_frequency(self):
    wordlists = PlaintextCorpusReader(self.prepared_training_data_root, '.*')
    tokenizer = TreebankWordTokenizer()
    total = len(wordlists.fileids())
    count = 0
    fdist = nltk.FreqDist()
    for fl in wordlists.fileids():
        count += 1
        fl_abs_path = os.path.join(self.prepared_training_data_root, fl)
        with open(fl_abs_path, 'r') as f:
            words = tokenizer.tokenize(f.read())
            fdist.update(words)
        print 'freqdist: %s of %s' % (count, total)
    with open(os.path.join(self.corpus_root, 'unigram_frequency.txt'), 'w') as f:
        f.writelines(['%s %s\n' % (word, freq) for (word, freq) in fdist.items()])
    return None

Author: Huarong | Project: cloze | Lines: 17 | Source: prepare.py
Example 13: _compute_biagram_frequency

def _compute_biagram_frequency(self):
    if not os.path.exists(self.bigram_frequency_dir):
        os.mkdir(self.bigram_frequency_dir)
    wordlists = PlaintextCorpusReader(self.prepared_training_data_root, '.*')
    tokenizer = TreebankWordTokenizer()
    total = len(wordlists.fileids())
    count = 0
    for fl in wordlists.fileids():
        count += 1
        print 'freqdist: %s of %s' % (count, total)
        fl_abs_path = os.path.join(self.prepared_training_data_root, fl)
        with open(fl_abs_path, 'r') as f:
            words = tokenizer.tokenize(f.read())
            bi_words = nltk.bigrams(words)
            fdist = nltk.FreqDist(bi_words)
        with open(os.path.join(self.bigram_frequency_dir, fl), 'w') as f:
            f.writelines(['%s %s %s\n' % (word[0], word[1], freq) for (word, freq) in fdist.items()])
    return None

Author: Huarong | Project: cloze | Lines: 18 | Source: prepare.py
Example 14: eiminar_stopwords

def eiminar_stopwords(words):
    a = open('english.txt')
    result = []
    english_stops = []
    for f in a:
        result.append(f)
    tokenizer = TreebankWordTokenizer()
    for s in result:
        tokenizer = RegexpTokenizer("[\w']+")
        temp = tokenizer.tokenize(s)
        english_stops += temp
    resultado = []
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
    for w in words:
        if w not in english_stops:
            resultado.append(stemmer.stem(w))
    return resultado

Author: YanetFrancisco | Project: NaiveBayesClassifier | Lines: 18 | Source: algorithm.py
Example 15: DssgUnigramExtractor

class DssgUnigramExtractor(object):
    """
    An instance of this is used to obtain a list of unigrams, given a text.
    Usage:
        unigramExtractor = DssgUnigramExtractor()
        tokenList = unigramExtractor.extract("here is a text as a string")  # ['text', 'string']
    """

    _cache = {}

    def __init__(self):
        self._tokenizer = TreebankWordTokenizer()
        self._stopwordSet = set(stopwords.words("english"))
        self._stemmer = PorterStemmer()

    def __repr__(self):
        return self.__class__.__name__ + "()"

    def extract(self, text):
        """
        Given a text, return a list of unigram tokens.
        """
        if text not in DssgUnigramExtractor._cache:
            # decode HTML entities, then strip any remaining markup
            cleaned = (
                text.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", '"')
                .replace("&amp;", "&")
                .replace("&nbsp;", " ")
            )
            cleaned = nltk.clean_html(cleaned)
            tokens = self._tokenizer.tokenize(cleaned)
            newTokens = []
            for tok in tokens:
                # lowercase and strip surrounding punctuation
                # (the exact character set was garbled in the source listing; best-guess reconstruction)
                tok = tok.lower().strip("`'.,-_*/:;\\?!@#$%^&*()=\"")
                # drop stopwords, one-character tokens, and pure numbers
                if tok in self._stopwordSet or len(tok) <= 1 or isAllNumbers(tok):
                    continue
                # apply stemming
                tok = self._stemmer.stem(tok)
                # a token like 'theres' can become a stopword only after stemming
                if tok in self._stopwordSet:
                    continue
                newTokens.append(tok)
            # cache under the original input so repeated calls hit the cache
            DssgUnigramExtractor._cache[text] = newTokens
        return DssgUnigramExtractor._cache[text]

Author: pombredanne | Project: ushine-learning | Lines: 56 | Source: vectorizer.py
Example 16: stopwords

def stopwords(filename):
    """Build a term-frequency table for the given file and derive a stoplist
    of its 100 most frequent tokens.
    Inputs:
        filename - the name of a plaintext file with a document on each line
    Outputs:
        A list of stopwords, a Counter mapping tokens to counts, and the total word count.
    """
    # We now track the number of times a word shows up (term frequency) and
    # the number of documents with a given word in it (document frequency)
    # separately. We use a Counter, which is exactly like a dictionary except
    #   - the values can only be ints
    #   - any key it hasn't seen yet is assumed to already have a value of 0
    # This means we don't have to check whether we've used a key before when
    # we use the "+= 1" operation.
    term_frequency_dict = Counter()
    word_total = 0
    tokenizer = TreebankWordTokenizer()
    with open(filename, 'r') as f:
        for line in f:
            words = tokenizer.tokenize(line.lower())
            # For the programmer types: there are several more efficient
            # ways to write this section using dictionaries or sets. You're
            # welcome to rewrite this part to exercise that.
            for word in words:
                term_frequency_dict[word] += 1
                word_total += 1
    # A fun feature of Counters is that they have a built-in function that
    # gives you the n keys with the biggest values, or the "most common"
    # things being counted. We can use this to find the most common words.
    # This comes out as a list of pairs of key and value, like
    # [('foo', 10), ('bar', 7), ... , ('rare', 1)]
    stoplist_pairs = term_frequency_dict.most_common(100)
    stoplist = [word for (word, freq) in stoplist_pairs]
    return stoplist, term_frequency_dict, word_total

Author: niutyut | Project: info-3350-fall-2015 | Lines: 43 | Source: ex3.py
Example 17: tokenize

def tokenize(text, stopword=False, punct=False, lower=False,
             stem=False, num=False, single=False, link=False):
    """
    num: True, exclude numbers
    single: True, exclude single-character tokens
    todo: handle unicode edge cases
    """
    token = []
    tokenizer = TreebankWordTokenizer()
    token_temp = tokenizer.tokenize(text)
    for elt in token_temp:
        #temp = i.decode('unicode-escape')
        #temp = re.sub(ur'[\xc2-\xf4][\x80-\xbf]+',
        #              lambda m: m.group(0).encode('latin1').decode('utf8'), temp)
        temp = unicode(elt)
        temp = unicodedata.normalize('NFKD', temp).encode('ascii', 'ignore')
        # get rid of empty strings
        #temp = i
        if temp:
            token.append(temp)
    token = [clean_front_end(word) for word in token if clean_front_end(word)]
    if lower:
        token = [word.lower() for word in token]
    if stem:
        token = [stemmer.stem(word) for word in token]
    if num:
        token = [word for word in token if not is_number(word)]
    if single:
        token = [word for word in token if len(word) > 1]
    if stopword:
        token = [word for word in token if word not in STOPWORD]
    if punct:
        token = [word for word in token if word not in PUNCT]
    if link:
        token = [word for word in token if not is_link(word)]
    # exclude empty strings
    token = [word for word in token if word]
    return token

Author: sysofwan | Project: zapfeeds | Lines: 43 | Source: algorithm.py
Example 18: sentences

def sentences(self, lowercase=False, strip_punct=[], num_placeholder=None):
    word_tokenizer = TreebankWordTokenizer()
    sent_tokenizer = nltk.data.LazyLoader('tokenizers/punkt/english.pickle')
    token_sents = [word_tokenizer.tokenize(sent) for sent in sent_tokenizer.tokenize(self.response)]

    if lowercase:
        token_sents = [[token.lower() for token in sent] for sent in token_sents]

    if len(strip_punct) > 0:
        token_sents = [[token for token in sent if token not in strip_punct] for sent in token_sents]

    if num_placeholder is not None:
        def replace_num(token, placeholder):
            try:
                float(token.replace(',', ''))
                return placeholder
            except ValueError:
                return token
        token_sents = [[replace_num(token, num_placeholder) for token in sent] for sent in token_sents]

    return token_sents

Author: mrmacthree | Project: hlt | Lines: 21 | Source: essay.py
Example 19: __init__

class MorphyStemmer:
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()

    def __call__(self, doc):
        stemmed_doc = []
        for t in self.tokenizer.tokenize(doc):
            stem = wordnet.morphy(t)
            if stem:
                stemmed_doc.append(stem.lower())
            else:
                stemmed_doc.append(t.lower())
        return stemmed_doc

Author: sangheestyle | Project: nlp2014 | Lines: 13 | Source: quiz_king.py
Example 20: test_treebank_span_tokenizer

def test_treebank_span_tokenizer(self):
    """
    Test TreebankWordTokenizer.span_tokenize function
    """
    tokenizer = TreebankWordTokenizer()

    # Test case in the docstring (note the two spaces after "(York).",
    # which the expected offsets below depend on)
    test1 = "Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks)."
    expected = [
        (0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
        (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
        (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
        (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)
    ]
    result = list(tokenizer.span_tokenize(test1))
    self.assertEqual(result, expected)

    # Test case with double quotation
    test2 = "The DUP is similar to the \"religious right\" in the United States and takes a hardline stance on social issues"
    expected = [
        (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
        (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
        (65, 68), (69, 74), (75, 76), (77, 85), (86, 92), (93, 95), (96, 102),
        (103, 109)
    ]
    result = list(tokenizer.span_tokenize(test2))
    self.assertEqual(result, expected)

    # Test case with double quotation as well as converted quotations
    test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
    expected = [
        (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
        (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
        (65, 68), (69, 74), (75, 76), (77, 79), (79, 87), (87, 89), (90, 96),
        (97, 99), (100, 106), (107, 113)
    ]
    result = list(tokenizer.span_tokenize(test3))
    self.assertEqual(result, expected)

Author: alpaco42 | Project: ML_Spring_2018 | Lines: 39 | Source: test_tokenize.py
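As a follow-up to Example 20, the standalone sketch below (not part of the NLTK test suite) shows that the offsets produced by span_tokenize index directly into the original string, so each (start, end) pair recovers its token via slicing.

from nltk.tokenize import TreebankWordTokenizer

text = "Good muffins cost $3.88 in New York."
tokenizer = TreebankWordTokenizer()
# span_tokenize yields (start, end) character offsets into `text`.
for start, end in tokenizer.span_tokenize(text):
    print((start, end), repr(text[start:end]))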
Note: The nltk.tokenize.TreebankWordTokenizer class examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs; the snippets were selected from open-source projects contributed by various developers. Copyright of each snippet remains with its original author; refer to the corresponding project's license before redistributing or reusing the code. Do not reproduce this article without permission.