This article collects typical usage examples of the Python class nltk.stem.PorterStemmer. If you have been wondering what the PorterStemmer class does and how to use it, the curated class examples below may help.

Twenty code examples of the PorterStemmer class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the site recommend better Python code examples.
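Before the project examples, here is a minimal orientation sketch of my own (not taken from any of the projects cited below): the class is instantiated once and its stem() method is applied to individual tokens.

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
for word in ['running', 'flies', 'easily']:
    # stem() reduces each token to its Porter stem, e.g. 'running' -> 'run'
    print(word, '->', stemmer.stem(word))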
Example 1: _stemmatize

def _stemmatize(self, word):
    # The WordNet lemmatizer won't reduce words ending in '-ing' unless it is told
    # they are verbs, so fall back to the Porter stemmer for those.
    lmtzr = WordNetLemmatizer()
    stemmer = PorterStemmer()
    if word.endswith('ing'):
        return stemmer.stem(word)
    return lmtzr.lemmatize(word)

Author: stong1108 | Project: CL_missedconn | Lines: 7 | Source: TopicModeling.py
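A brief illustration of the comment in Example 1 (my own sketch, not part of the CL_missedconn project): the WordNet lemmatizer leaves '-ing' forms untouched unless the part of speech is given, while the Porter stemmer reduces them unconditionally.

from nltk.stem import PorterStemmer, WordNetLemmatizer

lmtzr = WordNetLemmatizer()
stemmer = PorterStemmer()
print(lmtzr.lemmatize('running'))           # 'running' (treated as a noun by default)
print(lmtzr.lemmatize('running', pos='v'))  # 'run'
print(stemmer.stem('running'))              # 'run'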
Example 2: porter_list1

def porter_list1(lista):
    stemmer = PorterStemmer()
    newlist = []
    for b in lista:
        b = stemmer.stem(b)
        newlist.append(b)
    return newlist

Author: ASpathoulas | Project: MSc-Courseworks | Lines: 7 | Source: best.py
Example 3: splitAndStem

def splitAndStem(inputfilename, outputfilename):
    '''
    For each ingredient, split it into words, stem each word, and construct a new recipe from those words.
    :param inputfilename:
    :return:
    '''
    with open(outputfilename, 'w') as ff:
        ff.write('[\n')
    with open(inputfilename) as f:
        d = eval(f.read())
    stemmer = PorterStemmer()
    with open(outputfilename, 'a') as ff:
        for i in d:
            # print(i)
            new_item = {}
            new_ingredients = []
            for ingredient in i['ingredients']:
                tokens = word_tokenize(ingredient)
                clean_tokens = [re.subn('[^A-Za-z]', '', token)[0] for token in tokens]
                new_ingredients += [stemmer.stem(w).lower() for w in clean_tokens]
            new_item['cuisine'] = i['cuisine']
            new_item['id'] = i['id']
            new_item['ingredients'] = new_ingredients
            json_recipe = json.dumps(new_item)
            ff.write('%s,\n' % str(json_recipe))

Author: sashavtyurina | Project: What-s-cooking | Lines: 29 | Source: Alex_whatscooking.py
Example 4: parseReviews

def parseReviews(mypath):
    filelist = os.listdir(mypath)
    wordDict = {}
    negationList = ["no", "not", "never", "can't", "won't", "cannot", "didn't", "couldn't"]
    negationFlag = False
    stopwordList = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    for file in filelist:
        with open(mypath + "/" + file, "r") as f:
            word_list = word_tokenize(f.read())
            for word in word_list:
                if word in negationList:
                    # double negative
                    if negationFlag:
                        negationFlag = False
                    else:
                        negationFlag = True
                    continue
                if not word.isalnum():
                    negationFlag = False
                if word.isalnum() and word not in stopwordList:
                    word = stemmer.stem(word)
                    if negationFlag:
                        word = "!" + word
                        negationFlag = False
                    if word not in wordDict:
                        wordDict[word] = 1
                    else:
                        wordDict[word] += 1
    return wordDict

Author: sagardmni | Project: sentiment_classification | Lines: 30 | Source: train.py
Example 5: tokenizeTags

def tokenizeTags(str, dict_items):
    # temp map (for getting the local term frequency)
    # for a sentence
    str = str.decode('ascii', 'ignore')
    # tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    # tokens = tokenizer.tokenize(str)
    tokens = str.split()
    # print tokens
    stemmer = PorterStemmer()
    # small set of stopwords (remove you, are, and, I -- those kinds of words)
    last = []
    # bigram_list = []
    for d in tokens:
        d = d.split('-')
        for c in d:
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
            # regular expression -> strip punctuation
            if c != '' and c not in dict_items:
                try:
                    if int(c):
                        if len(c) != 4 and (c > 2015 or c < 1900):  # keep years
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())
                    pass
                # c = stemmer.stem(c.lower())
                last.append(c)
                # bigram generation
                # index = len(last)
                # if index > 1:
                #     bigram = last[index-2] + ' ' + last[index-1]
                #     bigram_list.append(bigram)
    return last

Author: wingsrc | Project: musicRecommendation_topicmodeling | Lines: 33 | Source: topic_model.py
Example 6: tokenize2_bigram

def tokenize2_bigram(str, df_freq):
    temp_map = {}
    # for a sentence
    str = str.decode('ascii', 'ignore')
    tokens = str.split()
    # print tokens
    stemmer = PorterStemmer()
    last = []
    bigram_list = []
    for d in tokens:
        d = d.split('-')
        for c in d:
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
            # regular expression -> strip punctuation
            if c != '':
                try:
                    if int(c):
                        if len(c) != 4 and (c > 2015 or c < 1900):  # keep years
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())
                    pass
                # c = stemmer.stem(c.lower())
                last.append(c)
                # bigram generation
                index = 0
                if index > 1:
                    bigram = last[index-2] + ' ' + last[index-1]
                    bigram_list.append(bigram)
                    updateDF(temp_map, df_freq, bigram)
                index += 1
    return bigram_list

Author: wingsrc | Project: musicRecommendation_topicmodeling | Lines: 34 | Source: topic_model.py
Example 7: openfile

def openfile(filename, output):
    print(filename)
    # start run time
    start = timeit.default_timer()
    ps = PorterStemmer()
    file = open(filename, "r")
    tokens = []
    # used for removing punctuation from the documents
    translate_table = dict((ord(char), None) for char in string.punctuation)
    start2 = timeit.default_timer()
    # split the lines into words and remove the punctuation
    for line in file:
        tokens += word_tokenize(line.translate(translate_table))
    start3 = timeit.default_timer()
    print("tokenize")
    print(start3 - start2)
    # create a set of stop words to be removed later
    stop_words = set(stopwords.words("english"))
    start6 = timeit.default_timer()
    # if a word is not a stop word, add it to a list
    filtered_sentence = []
    for w in tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    start7 = timeit.default_timer()
    print("stop word removal")
    print(start7 - start6)
    startw = timeit.default_timer()
    # stem each word and add it to the output file in CSV form
    f = open(output, 'w')
    iterFilSen = iter(filtered_sentence)
    if output == "documents.csv":
        for w in filtered_sentence:
            if w == "I":
                f.write("\n")
            f.write(ps.stem(w))
            f.write(",")
    else:
        for w in iterFilSen:
            if w == "I":
                f.write("\n")
                # removes the "I number W" marker
                next(iterFilSen)
                next(iterFilSen)
            else:
                f.write(ps.stem(w))
                f.write(",")
    # end run time
    stop = timeit.default_timer()
    print("writing")
    print(stop - startw)
    print("total: " + output)
    print(stop - start)

Author: SomeUserName-ForMe | Project: InvertedIndex | Lines: 60 | Source: stemmer.py
Example 8: testing

def testing():
    # - tokenize on sentence and word
    ex_txt = "hello there Mr. Bartuska, How are you? The weather is great and I enjoy Python. cheers!"
    print(sent_tokenize(ex_txt))
    print(word_tokenize(ex_txt, language='english'))
    # - stop words (pre-defined by nltk)
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    words = word_tokenize(ex_txt)
    print(words)
    filtered_sent = []
    for w in words:
        if w not in stop_words:
            filtered_sent.append(w)
    print(filtered_sent)
    filtered_sent = [w for w in words if not w in stop_words]
    print(filtered_sent)
    # - stemming
    ps = PorterStemmer()
    example_words = ['python', 'pythoner', 'pythoning', 'pythoned', 'pythonly']
    # for w in example_words:
    #     print(ps.stem(w))
    new_text = "it is very important to be pothonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
    words = word_tokenize(new_text)
    for w in words:
        print(ps.stem(w))

Author: gbartusk | Project: coursera_data_science_capstone | Lines: 28 | Source: capstone.py
Example 9: prepare_data

def prepare_data(reviews):
    # run the Porter stemmer on every word
    stemmer = PorterStemmer()
    stem_text = lambda x: {'class': x['class'],
                           'text': stemmer.stem(x['text'])}
    # clean text and remove empty items
    reviews = filter(lambda x: x != {}, reviews)
    reviews = map(stem_text, reviews)
    print('classification: ' + reviews[observed_element]['class'] + '\n\n------------------------------------\n\n')
    print('stemming: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')
    # remove stopwords
    reviews = map(remove_stop_words, reviews)
    print('stopwords: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')
    # remove undesired patterns
    reviews = map(clean_text, reviews)
    print('elementos inuteis: ' + reviews[observed_element]['text'] + '\n\n------------------------------------\n\n')  # "elementos inuteis" = useless elements
    return reviews

Author: teago19 | Project: sentimentAnalysis | Lines: 25 | Source: classify.py
Example 10: extract_clean_sentences

def extract_clean_sentences(self):
    """
    Extracts sentences from plain text. Also applies the following cleaning
    operations:
    - Exclude all characters not recognized by 'utf-8' encoding
    - Exclude all characters not contained in [a-zA-Z0-9 '-]
    - Exclude common stopwords
    """
    text = self.raw_text
    exclude = re.compile('[^a-zA-Z0-9 \'-]')
    linebreaks = re.compile('\s')
    excess_space = re.compile('\s+')
    stemmer = PorterStemmer()
    sentences = sent_tokenize(text)
    out = []
    for sentence in sentences:
        sentence = linebreaks.sub(' ', sentence)
        sentence = exclude.sub(' ', sentence)
        sentence = excess_space.sub(' ', sentence)
        tokens = word_tokenize(sentence)
        tokens = [stemmer.stem(t.lower()) for t in tokens]
        out.append(tokens)
    return out

Author: flinder | Project: human_rights_text | Lines: 27 | Source: import_reports.py
Example 11: preprocess_document

def preprocess_document(doc):
    stopset = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = wordpunct_tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return final

Author: mrquant | Project: InfoRetrievalSystem | Lines: 7 | Source: irs.py
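A possible call of preprocess_document (my own usage sketch, assuming the snippet's imports of stopwords and wordpunct_tokenize are in scope; the exact output depends on the installed NLTK stopword list):

doc = "The quick brown foxes were jumping over the lazy dogs."
print(preprocess_document(doc))
# e.g. ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']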
Example 12: preprocess

def preprocess(text):
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    tokens = [tok for tok in word_tokenize(text.lower())
              if tok not in stop]
    tokens_stemmed = [stemmer.stem(tok) for tok in tokens]
    return tokens_stemmed

Author: kedz | Project: newsblaster | Lines: 7 | Source: cluster.py
Example 13: preprocessing

def preprocessing(text, debug=False):
    if debug:
        print text
    # lower case
    text = text.lower()
    if debug:
        print text
    # can't -> cannot, bya's -> bya is
    text = replacers.RegexpReplacer().replace(text)
    if debug:
        print text
    # word tokenize
    words = word_tokenize(text)
    if debug:
        print words
    # remove stopwords
    english_stops = set(stopwords.words('english'))
    english_stops_added = english_stops | {'.', ',', ':', ';'}
    words = [word for word in words if word not in english_stops_added]
    if debug:
        print words
    # stem words
    stemmer = PorterStemmer()
    words_stemmed = list(map(lambda word: stemmer.stem(word), words))
    if debug:
        print words_stemmed
    return words, words_stemmed

Author: Muugii-bs | Project: hommie | Lines: 33 | Source: utils.py
Example 14: buildVocab

def buildVocab(self):
    '''Build a vocabulary for the selected documents (from dir database).'''
    ## Note: The source of text should be Lucene-processed field values. Lucene tokenized the text,
    ## removed stop words, and may have taken other unknown steps.
    ## Right now the vocabulary is built on the raw text with NLTK-based stopword removal and
    ## tokenization. This should be improved.
    # collect contents from /database/ for each of these docs
    for pmid in self.pmidList:  # self.pmidList includes the query and the 99 most similar articles selected by BM25
        self.corpus.append(file(os.path.join(self.dbDir, pmid)).read())  # corpus contains raw text (MH, title*2, abstract)
    for text in self.corpus:
        sent_tokenize_list = sent_tokenize(text.strip().lower(), "english")  # tokenize an article text
        stemmed_text = []
        if sent_tokenize_list:  # if sent_tokenize_list is not empty
            porter_stemmer = PorterStemmer()
            for sent in sent_tokenize_list:
                words = TreebankWordTokenizer().tokenize(sent)  # tokenize the sentence
                words = [word.strip(string.punctuation) for word in words]
                words = [word for word in words if not word in stopwords.words("english")]
                words = [word for word in words if len(word) > 1]  # remove single letters and non-alphabetic characters
                words = [word for word in words if re.search('[a-zA-Z]', word)]
                words = [porter_stemmer.stem(word) for word in words]  # apply the Porter stemmer
                stemmed_text.append(" ".join(words))
                self.vocab += words
        self.stemmed_corpus.append(". ".join(stemmed_text))  # append a stemmed article text
    # save stemmed corpus
    pickle.dump(self.stemmed_corpus, file(os.path.join(self.stemmed_corpusDir, str(self.pmidList[0])), "w"))
    # remove low-frequency tokens and redundant tokens
    tokenDist = Counter(self.vocab)
    lowFreqList = []
    for token, count in tokenDist.iteritems():
        if count < 2:
            lowFreqList.append(token)
    self.vocab = list(set(self.vocab) - set(lowFreqList))
    # save vocabulary
    pickle.dump(self.vocab, file(os.path.join(self.vocabDir, str(self.pmidList[0])), "w"))

Author: w2wei | Project: XPRC | Lines: 33 | Source: RetKNN_MPRC.py
Example 15: StemmedBagOfWordsFeatureGenerator

class StemmedBagOfWordsFeatureGenerator(EdgeFeatureGenerator):
    """
    Generates a stemmed Bag of Words representation for each sentence that contains
    an edge, using the function given in the argument.

    By default it uses the Porter stemmer.

    :type feature_set: nala.structures.data.FeatureDictionary
    :type stemmer: nltk.stem.PorterStemmer
    :type stop_words: list[str]
    :type training_mode: bool
    """

    def __init__(self, feature_set, stop_words=[], training_mode=True):
        self.feature_set = feature_set
        """the feature set for the dataset"""
        self.training_mode = training_mode
        """whether the mode is training or testing"""
        self.stemmer = PorterStemmer()
        """an instance of the PorterStemmer"""
        self.stop_words = stop_words
        """a list of stop words"""

    def generate(self, dataset):
        for edge in dataset.edges():
            sentence = edge.part.sentences[edge.sentence_id]
            if self.training_mode:
                for token in sentence:
                    if self.stemmer.stem(
                        token.word
                    ) not in self.stop_words and not token.features['is_punct']:
                        feature_name = '4_bow_stem_' + self.stemmer.stem(
                            token.word) + '_[0]'
                        self.add_to_feature_set(edge, feature_name)

Author: Rostlab | Project: relna | Lines: 34 | Source: sentence.py
Example 16: stemText

def stemText(text):
    ps = PorterStemmer()
    words = word_tokenize(text)
    # all_words = []
    for w in words:
        # appends to an all_words list defined outside this function
        all_words.append(ps.stem(w))

Author: sdsunjay | Project: rideshare | Lines: 7 | Source: train_classifier.py
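Because the snippet relies on an all_words list defined at module level (its local initialization is commented out), a caller would need something like the following hypothetical sketch (assuming word_tokenize is imported in the same module):

all_words = []  # module-level accumulator assumed by stemText
stemText("Dogs are running in the park")
print(all_words)  # stemmed tokens accumulated across calls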
Example 17: parseTranscript

def parseTranscript(transcript):
    assert isinstance(transcript, Transcript), \
        "transcript must be stored in custom namedtuple, not {}".format(type(transcript))
    text = transcript.prepared.append(transcript.QandA)
    id = "{ticker}-{year}-{month}-{day}".format(ticker=transcript.ticker.split(':')[-1],
                                                year=transcript.date.year,
                                                month=transcript.date.month,
                                                day=transcript.date.day)
    tokenizer = wordpunct_tokenize
    stemmer = PorterStemmer()
    index = dict()
    pos = 0
    for row in text:
        for i, token in enumerate(tokenizer(row.lower())):
            token = stemmer.stem(token)
            if token not in index and '|' not in token:
                index[token] = [id, [str(pos + i)]]
            elif '|' not in token:
                index[token][-1].append(str(pos + i))
        try:
            pos += (i + 1)
        except:
            pass
    return index

Author: trevorlindsay | Project: earnings-calls | Lines: 31 | Source: build_index.py
Example 18: tokenize2

def tokenize2(str, df_freq):
    # temp map (for getting the local term frequency)
    temp_map = {}
    # for a sentence
    str = str.decode('ascii', 'ignore')
    # tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    # tokens = tokenizer.tokenize(str)
    tokens = str.split()
    # print tokens
    stemmer = PorterStemmer()
    # small set of stopwords (remove you, are, and, I -- those kinds of words)
    last = []
    # bigram_list = []
    for d in tokens:
        d = d.split('-')
        for c in d:
            c = re.compile('[%s]' % re.escape(string.punctuation)).sub('', c)
            # regular expression -> strip punctuation
            if c != '':
                try:
                    if int(c):
                        if len(c) != 4 and (c > 2015 or c < 1900):  # keep years
                            c = stemmer.stem('NUM')
                except Exception:
                    c = stemmer.stem(c.lower())
                    pass
                last.append(c)
                updateDF(temp_map, df_freq, c)

Author: wingsrc | Project: musicRecommendation_topicmodeling | Lines: 31 | Source: topic_model.py
Example 19: get_english_vocab

def get_english_vocab(lemmatize=False):
    vocab = (w.lower() for w in words.words())
    if lemmatize:
        stemmer = PorterStemmer()
        vocab = (stemmer.stem(w) for w in vocab)
    return set(vocab)

Author: ned2 | Project: okdata | Lines: 7 | Source: okreader.py
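A hedged usage sketch for get_english_vocab (my own, assuming the NLTK 'words' corpus has been downloaded and imported as in the snippet's module):

vocab = get_english_vocab(lemmatize=True)
print(len(vocab))      # size of the stemmed vocabulary
print('run' in vocab)  # membership test against the stemmed word list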
Example 20: new_lesk

def new_lesk(context_sentence, ambiguous_word, pos=None, stem=True, hyperhypo=True):
    ps = PorterStemmer()
    max_overlaps = 0; lesk_sense = None
    context_sentence = context_sentence.split()
    for ss in wn.synsets(ambiguous_word):
        # If POS is specified.
        if pos and ss.pos is not pos:
            continue
        lesk_dictionary = []
        # Includes definition.
        lesk_dictionary += ss.definition.split()
        # Includes lemma_names.
        lesk_dictionary += ss.lemma_names
        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            lesk_dictionary += list(chain(*[i.lemma_names for i in ss.hypernyms() + ss.hyponyms()]))
        if stem == True:  # Matching exact words causes sparsity, so let's match stems.
            lesk_dictionary = [ps.stem(i) for i in lesk_dictionary]
            context_sentence = [ps.stem(i) for i in context_sentence]
        overlaps = set(lesk_dictionary).intersection(context_sentence)
        if len(overlaps) > max_overlaps:
            lesk_sense = ss
            max_overlaps = len(overlaps)
    return lesk_sense

Author: fieryfish | Project: wordSubstitutionTask | Lines: 30 | Source: lesk.py
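Note that this snippet reads Synset.definition, Synset.pos and Synset.lemma_names as attributes, which matches older NLTK releases; in NLTK 3.x these are methods, so the function would need small adjustments there. A hypothetical call of my own, assuming a compatible NLTK version and the snippet's imports (wordnet as wn, itertools.chain):

sense = new_lesk("I went to the bank to deposit my money", "bank")
if sense is not None:
    print(sense, sense.definition)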
Note: The nltk.stem.PorterStemmer class examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective developers; copyright in the source code belongs to the original authors. Please consult the corresponding project's license before redistributing or reusing the code; do not reproduce this article without permission.