This article collects and summarizes typical usage examples of the Python class nltk.stem.snowball.EnglishStemmer. If you are wondering what exactly the EnglishStemmer class does, how to use it, or what code that uses it looks like in practice, the curated class examples below may help.
The following presents 20 code examples of the EnglishStemmer class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help our system recommend better Python code examples.
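Before the collected examples, here is a minimal sketch of the core API that all of them build on (the word list is chosen purely for illustration):

from nltk.stem.snowball import EnglishStemmer

stemmer = EnglishStemmer()  # the Snowball ("Porter2") stemmer for English
for word in ["running", "fairly", "generously"]:
    print(word, "->", stemmer.stem(word))
# expected output:
#   running -> run
#   fairly -> fair
#   generously -> generous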
Example 1: str_to_dict

def str_to_dict(s):
    '''
    creates dictionary of words and counts
    input: s string
    output: dictionary {word: count}
    '''
    s = s.encode('ascii', 'ignore')
    s = str(s)
    word_dict = {}
    l = re.findall(WORDRE, s)  # WORDRE is a module-level word regex in the source project
    for w in l:
        w = w.lower()                # make all letters lowercase
        if w[0] == "'":              # remove single quotes from beginning/
            w = w[1:]                # end of words in l
        elif w[-1] == "'":
            w = w[:-1]
        w = EnglishStemmer().stem(w) # stems non-noun/verbs
        w = w.encode('ascii', 'ignore')
        if w != '':
            if w not in word_dict:   # build dictionary
                word_dict[w] = 1
            else:
                word_dict[w] += 1
    return word_dict

Developer: ccr122, Project: ccr, Lines: 28, Source: parse.py
Example 2: getAllStemEntities

def getAllStemEntities(entities):
    st = EnglishStemmer()
    q = [",", ".", "!", "?", ":", ";"]
    tmp = []
    sourceEntities = [x for x in entities if len(x) > 0]
    np.random.shuffle(entities)
    for i in xrange(len(entities)):
        if len(entities[i]) == 0:
            continue
        if i % 1000 == 0:
            print i
        entities[i] = entities[i].lower()
        entities[i] = entities[i].replace(" - ", " \u2013 ", entities[i].count(" - "))
        entities[i] = entities[i].replace(" -", " \u2013", entities[i].count(" -"))
        entities[i] = entities[i].replace("- ", "\u2013 ", entities[i].count("- "))
        entities[i] = entities[i].replace("-", " - ", entities[i].count("-"))
        entities[i] = entities[i].replace(")", " )", entities[i].count(")"))
        entities[i] = entities[i].replace("(", "( ", entities[i].count("("))
        entities[i] = entities[i].replace("\u0027", " \u0027", entities[i].count("\u0027"))
        for w in q:
            entities[i] = entities[i].replace(w, " " + w, entities[i].count(w))
        word = entities[i].split(" ")
        s = ""
        for w in word:
            s += st.stem(unicode(w)) + " "
        tmp.append(s[:-1])
        if len(tmp) > 50:
            break
    return tmp, entities[: len(tmp)]

Developer: mikhaylova-daria, Project: NER, Lines: 31, Source: allFunctions.py
Example 3: Granularity

def Granularity(sentenceArray):
    for sentence in sentenceArray:
        # print(sentence)
        try:
            stemmer = EnglishStemmer()
            sentence = re.sub(r'\#.*?$', '', sentence)
            sentence = re.sub(r'\#.*? ', '', sentence)
            sentence = re.sub(r'\@.*?$', '', sentence)
            sentence = re.sub(r'\@.*? ', '', sentence)
            sentence = re.sub(r'pic.twitter.*?$', '', sentence)
            sentence = re.sub(r'pic.twitter.*? ', '', sentence)
            sentence = re.sub(r'\'m', ' am', sentence)
            sentence = re.sub(r'\'d', ' would', sentence)
            sentence = re.sub(r'\'ll', ' will', sentence)
            sentence = re.sub(r'\&', 'and', sentence)
            sentence = re.sub(r'don\'t', 'do not', sentence)
            data = stemmer.stem(sentence)
            print(data)
            from nltk.corpus import stopwords
            sentence = str(data)
            stop = stopwords.words('english')
            final = [i for i in sentence.split() if i not in stop]
            finalstring = ' '.join(final)
            os.system("printf \"" + str(finalstring) + "\n\">> stemstop/" + word)  # `word` comes from the enclosing scope in the source project
        except Exception as e:
            print(e)

Developer: PgnDvd, Project: SNLP, Lines: 29, Source: Stemmer.py
Example 4: query

def query(word):
    db = MySQLdb.connect("127.0.0.1", "dizing", "ynr3", "dizing")
    cursor = db.cursor()
    snowball_stemmer = EnglishStemmer()
    stem2 = snowball_stemmer.stem(word)
    cursor.execute("SELECT * FROM words WHERE original=%s OR stem1=%s OR stem2=%s", (word, word, stem2))
    rows = cursor.fetchall()
    words1 = dict()
    words2 = dict()
    for row in rows:
        if row[1] == word or row[3] == word:
            words1[word] = row[0]
        else:
            words2[word] = row[0]
    scenes1 = []
    scenes2 = []
    for (i, words_dict) in [(1, words1), (2, words2)]:
        wids = words_dict.values()
        for wid in wids:
            sql = "SELECT s.sentence, s.start, s.stop, s.ready, m.title FROM scene AS s, words_scenes AS ws, movie as m " + \
                  "WHERE ws.wid=%d AND ws.sid=s.sid AND s.mid = m.mid" % int(wid)
            # print sql
            cursor.execute(sql)
            rows = cursor.fetchall()
            if (i == 1): scenes1 += rows
            else: scenes2 += rows
    print scenes1
    print scenes2
    db.close()
    return scenes1 + scenes2

Developer: yasinzor, Project: videosozluk, Lines: 30, Source: query_word.py
Example 5: _execute

def _execute(self):
    corpus = mongoExtractText(self.name)
    stemmer = EnglishStemmer()
    for item in corpus:
        line = item.replace(',', ' ')
        stemmed_line = stemmer.stem(line)
        self.sentiment.append((sentiment.sentiment(stemmed_line), stemmed_line))

Developer: cevaris, Project: nebula, Lines: 8, Source: mining.py
Example 6: stem_word

def stem_word(word):
    """
    Stem words
    :param word: (str) text word
    :returns: stemmed word
    """
    stemmer = EnglishStemmer()
    return stemmer.stem(word)

Developer: vipul-sharma20, Project: tweet-analysis, Lines: 8, Source: util.py
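A quick usage sketch of this helper (expected Snowball outputs):

stem_word('running')     # -> u'run'
stem_word('generously')  # -> u'generous'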
Example 7: as_eng_postagged_doc

def as_eng_postagged_doc(doc):
    '''Uses nltk default tagger.'''
    tags = [t for _, t in nltk.pos_tag(list(doc.word))]
    stemmer = EnglishStemmer()
    lemmata = [stemmer.stem(w) for w in list(doc.word)]
    doc['pos'] = Series(tags)
    doc['lemma'] = Series(lemmata)
    return doc

Developer: estnltk, Project: pfe, Lines: 8, Source: corpus.py
Example 8: use_snowball_stemmer

def use_snowball_stemmer(self, word):
    """
    return the stemmed word using the snowball algorithm
    :param word:
    :return:
    """
    englishStemmer = EnglishStemmer()
    stemmed_word = englishStemmer.stem(word)
    return stemmed_word

Developer: soumik-dutta, Project: Keyword-Extraction, Lines: 9, Source: Stemming.py
Example 9: getLemmatizerInfo

def getLemmatizerInfo(pathArticle):
    data = open(pathArticle, "r")
    text1 = data.read().decode('utf-8')
    sourceText = text1
    links1 = []
    l = 0
    for q in text1.split():
        if q == '\ufeff':
            continue
        links1.append([text1.find(q, l), q])
        l = len(q) + 1 + text1.find(q, l)
    text1 = text1.replace(' - ', ' \u2013 ', text1.count(' - '))
    text1 = text1.replace(' -', ' \u2013', text1.count(' -'))
    text1 = text1.replace('- ', '\u2013 ', text1.count('- '))
    text1 = text1.replace('-', ' - ', text1.count('-'))
    text1 = text1.replace('(', '( ', text1.count('('))
    text1 = text1.replace(')', ' )', text1.count(')'))
    text1 = text1.replace(' \u0027', ' \u301E', text1.count(' \u0027'))
    text1 = text1.replace('\u0027', ' \u0027', text1.count('\u0027'))
    text1 = text1.split()
    if text1[0] == u'\ufeff':
        text1 = text1[1:]
    text = []
    for word in text1:
        text2 = []
        if len(word) == 0:
            continue
        while word[len(word)-1] in [',', '.', '!', '?', ':', ';']:
            text2.append(word[len(word)-1])
            word = word[:-1]
            if len(word) == 0:
                break
        text.append(word)
        for i in range(len(text2)-1, -1, -1):
            text.append(text2[i])
    out = ''
    st = EnglishStemmer()
    l = 0
    links = []
    for word in text:
        if isOk(word):  # isOk is defined elsewhere in the source project
            q = st.stem(word) + ' '
        else:
            q = word + ' '
        out += q.lower()
        links.append([l, q])
        l += len(q)
    return out, links, links1, sourceText

Developer: mikhaylova-daria, Project: NER, Lines: 55, Source: allFunctions.py
Example 10: stemming

def stemming(tweet):
    tweets = tweet.split()
    wrdStemmer = EnglishStemmer()
    stemTweet = []
    try:
        for tweet in tweets:
            tweet = wrdStemmer.stem(tweet)
            stemTweet.append(tweet)
    except:
        print("Error: Stemming")
    return " ".join(stemTweet)

Developer: RohithEngu, Project: Opinion-Summarizer, Lines: 11, Source: PreProcessing.py
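For illustration, the expected behavior on a small tweet-like string (per the Snowball rules):

stemming("cats running quickly")  # -> 'cat run quick'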
Example 11: fix_lemma_problem

def fix_lemma_problem(pred_scores, targets, space):
    from nltk.stem.snowball import EnglishStemmer
    es = EnglishStemmer()
    r = pred_scores.copy()
    lemmas = np.array([es.stem(v) for v in space.vocab])
    for i, t in enumerate(targets):
        g = es.stem(space.vocab[t])
        mask = (lemmas == g)
        # print space.vocab[t], np.sum(mask)
        r[i][mask] = -1e9
        # print r[i][mask]
    return r

Developer: stephenroller, Project: naacl2016, Lines: 12, Source: lexsub.py
Example 12: get_stemmed_keywords

def get_stemmed_keywords(keywords):
    stemmer = EnglishStemmer()
    stemmed_keywords = list(keywords)
    # split into list of list
    stemmed_keywords = [keyword.split() for keyword in stemmed_keywords]
    # stem individual words
    stemmed_keywords = [list(stemmer.stem(word) for word in keyword) for keyword in stemmed_keywords]
    # list of words to string
    stemmed_keywords = [' '.join(keyword).encode('ascii') for keyword in stemmed_keywords]
    return stemmed_keywords

Developer: bohrjoce, Project: keyword-extraction, Lines: 12, Source: evaluate_multiple.py
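A usage sketch (expected outputs under Python 2, where encode('ascii') returns a plain str):

get_stemmed_keywords(['machine learning', 'neural networks'])
# -> ['machin learn', 'neural network']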
Example 13: main

def main(fname):
    e = EnglishStemmer()
    n, a = 0, 0
    for line in open(sys.argv[1]):
        title, body, tags, creationdate, acceptedanswerid, score, viewcount = eval(line)
        # Process text into tokens
        html_tags = RX_OPEN_TAGS.findall(body)
        body = RX_TAGS.sub("", body)
        print " ".join(e.stem(s) for s in RX_NONWORD.split(body))
    M = bayes.NaiveLearner(adjust_threshold=True, name="Adjusted Naive Bayes")

Developer: andrewdyates, Project: signalfire_sap, Lines: 12, Source: parse2.py
Example 14: stemmed

def stemmed(text, snowball=False):
    """Returns stemmed text
    """
    if snowball:
        st = EnglishStemmer()
    else:
        st = PorterStemmer()
    words = wordpunct_tokenize(text)
    words = [st.stem(w) for w in words]
    text = ' '.join(words)
    return text

Developer: soodoku, Project: search-names, Lines: 12, Source: preprocess.py
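The snowball flag changes the result for some words; the classic 'fairly' case shows the difference between Porter and Snowball (Porter2):

stemmed('fairly', snowball=False)  # -> 'fairli' (Porter)
stemmed('fairly', snowball=True)   # -> 'fair'   (Snowball)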
Example 15: similarity_score

def similarity_score(word1, word2):
    """ see sections 2.3 and 2.4 of http://dx.doi.org.ezp-prod1.hul.harvard.edu/10.1109/TKDE.2003.1209005
    :type word1: string
    :type word2: string
    :return: float: between 0 and 1; similarity between two given words
    """
    stemmer = EnglishStemmer()
    if stemmer.stem(word1) == stemmer.stem(word2):
        return 1
    alpha = 0.2
    beta = 0.6
    l, h = get_path_length_and_subsumer_height(word1, word2)
    return exp((-1)*alpha*l)*((exp(beta*h)-exp((-1)*beta*h))/(exp(beta*h)+exp((-1)*beta*h)))

Developer: ReganBell, Project: QReview, Lines: 13, Source: Analyze.py
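The return expression is the cited paper's similarity measure e^(-alpha*l) * tanh(beta*h), written out with exponentials: since (e^x - e^-x)/(e^x + e^-x) = tanh(x), the final line could equivalently be written as:

from math import exp, tanh
return exp(-alpha * l) * tanh(beta * h)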
Example 16: normalize_tags

def normalize_tags():
    cursor.execute('SELECT app_id, tag, times FROM tag_app_rel;')
    all_tag_data = defaultdict(dict)
    for r in cursor:
        all_tag_data[r[0]][r[1]] = r[2]
    from nltk.stem.snowball import EnglishStemmer
    stemmer = EnglishStemmer()
    for app_id, tag_to_times in all_tag_data.iteritems():
        normalized_app_tag_dict = defaultdict(int)
        for tag, times in tag_to_times.iteritems():
            normalized_app_tag_dict[stemmer.stem(tag)] += times
        for tag, times in normalized_app_tag_dict.iteritems():
            cursor.execute('INSERT INTO tag_app_relation (app_id, tag, times) VALUES (%s, %s, %s)', (app_id, tag, times))

Developer: demien-aa, Project: CodingMan, Lines: 13, Source: services.py
Example 17: nltk_tokenizer

def nltk_tokenizer(text, min_size=4, *args, **kwargs):
    from nltk.stem.snowball import EnglishStemmer
    from nltk.corpus import stopwords as stwds
    from nltk.tokenize import TreebankWordTokenizer
    stemmer = EnglishStemmer()
    stopwords = set(stwds.words('english'))
    text = [stemmer.stem(w)
            for w in TreebankWordTokenizer().tokenize(text)
            if w not in stopwords and len(w) >= min_size]
    return text

Developer: danielcestari, Project: machine_learning, Lines: 13, Source: naive.py
Example 18: tokenize_documents

def tokenize_documents(documents):
    stop_words = stopwords.words('english') + stopwords.words('spanish')  # common words to be filtered
    english = EnglishStemmer()
    arabic = ISRIStemmer()
    punctuation = {ord(char): None for char in string.punctuation}
    def valid_word(token, filtered=stop_words):
        # Returns false for common words, links, and strange patterns
        if (token in filtered) or (token[0:4] == u'http') or \
           (token in string.punctuation):
            return False
        else:
            return True
    for doc in documents:
        row = doc[0]
        doc = doc[1]
        if doc is not None:
            # remove trailing whitespace
            doc = doc.strip()
            # remove twitter handles (words in doc starting with @)
            doc = re.sub(r"@\w+", "", doc)
            # lowercase letters
            doc = doc.lower()
            # remove punctuation
            doc = doc.translate(punctuation)
            # tokenization: handles documents with arabic or foreign characters
            tokens = nltk.tokenize.wordpunct_tokenize(doc)
            cleaned_tokens = []
            for token in tokens:
                # for valid words, correct spellings of gaddafi and stem words
                if valid_word(token):
                    if token in [u'gadhafi', u'gadafi', u'ghadhafi', u'kadhafi', u'khadafi', u'kaddafi']:
                        token = u'gaddafi'
                    else:
                        token = arabic.stem(english.stem(token))
                    cleaned_tokens.append(token)
            yield row
            yield cleaned_tokens

Developer: sharonxu, Project: nlp-twitter, Lines: 50, Source: process_text.py
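Because tokenize_documents is a generator that alternately yields a row id and that row's cleaned token list, a caller would consume it in pairs (a sketch with made-up data):

docs = [(1, u'RT @user Gaddafi speaks http://t.co/x')]
it = tokenize_documents(docs)
row = next(it)     # -> 1
tokens = next(it)  # -> the cleaned, stemmed token list for row 1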
Example 19: stem_sen

def stem_sen(list_sentences):
    stemmer = EnglishStemmer()
    # map back should be a dict with words,
    # each word maps to 3 versions: noun, adj, verb,
    # and each version is a list of pairs
    lem = WordNetLemmatizer()
    mapping_back = {}
    res_list = []
    res_sen = []
    # of course we want to return a list of sentences back as well
    for sent in list_sentences:
        tmp_list = []
        tok_list = word_tokenize(sent)
        tok_pos = nltk.pos_tag(tok_list)
        for tok, pos in tok_pos:
            if (tok.lower() in stopwords.words('english')):
                continue
            if len(tok) == 1:
                continue
            tok = lem.lemmatize(tok)
            pos = pos[:2]
            if ('NN' not in pos) and ('JJ' not in pos) and ('VB' not in pos):
                continue
            stem_tok = stemmer.stem(tok)
            if (stem_tok not in mapping_back):
                mapping_back[stem_tok] = {}
            if pos not in mapping_back[stem_tok]:
                mapping_back[stem_tok][pos] = {}
            # increase count
            if tok not in mapping_back[stem_tok][pos]:
                mapping_back[stem_tok][pos][tok] = 1
            else:
                mapping_back[stem_tok][pos][tok] += 1
            tmp_list.append(stem_tok + '-' + pos)
        res_sen.append(tmp_list)
    res_map = {}
    # do the second run through to find the most frequent mapping
    for tok in mapping_back:
        for pos in mapping_back[tok]:
            tmp_tok = tok + '-' + pos
            # find the most frequent unstemmed word corresponding to the stemmed + tagged form
            most_freq = max(mapping_back[tok][pos], key=mapping_back[tok][pos].get)
            res_map[tmp_tok] = most_freq.encode('ascii')
            res_list.append(tmp_tok)
    return res_sen, res_list, res_map

Developer: bohrjoce, Project: keyword-extraction, Lines: 49, Source: feature_extract.py
Example 20: tokenize

def tokenize(self):
    terms = word_tokenize(self.text)
    self.tokens = []
    self.lemmas = []
    stemmer = EnglishStemmer()
    lemmatizer = WordNetLemmatizer()
    for term in terms:
        try:
            self.tokens.append(stemmer.stem(term).lower())
            self.lemmas.append(lemmatizer.lemmatize(term.lower()))
        except Exception, e:
            print 'current text:', self.text
            print 'current term:', term
            print str(e)
            sys.exit(-1)

Developer: DrDub, Project: window_shopper, Lines: 15, Source: WindowExtractor.py
Note: The nltk.stem.snowball.EnglishStemmer class examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors, and its distribution and use are subject to the corresponding projects' licenses. Do not republish without permission.