This article collects typical usage examples of the nltk.PorterStemmer class in Python. If you are wondering what exactly the PorterStemmer class does, how to use it, or where to find working examples, the hand-picked class examples below may help.
A total of 20 PorterStemmer code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps our system recommend better Python code examples.
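Before diving into the collected examples, here is a minimal sketch of the typical PorterStemmer workflow (create a stemmer, tokenize, stem each token). The sample sentence and variable names are purely illustrative, and word_tokenize assumes the NLTK 'punkt' tokenizer data has already been downloaded.
from nltk import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
sentence = "The runners were running quickly through the leaves"
tokens = word_tokenize(sentence.lower())
# each token is reduced to its stem, e.g. 'running' -> 'run', 'quickly' -> 'quickli'
stems = [stemmer.stem(token) for token in tokens]
print(stems)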
Example 1: process_email
def process_email(filename):
f = open(filename, 'r')
text = f.read()
f.close()
text = text.lower()
#replaces html tags by space
text = re.sub(r'<[^<>]+>', ' ', text)
#replaces numbers by word number
text = re.sub(r'[0-9]+', 'number', text)
#replaces URLs by word httpaddr
text = re.sub(r'(http|https)://[^\s]*', 'httpaddr', text)
#replaces email addresses by word emailaddr
text = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', text)
#replaces dollar signs with word dollar
text = re.sub(r'[$]+', 'dollar', text)
#removes punctuation and non-words and separates words
words = re.split('[^a-z0-9]| ', text)
#removes empty strings
words = filter(lambda x: x!='', words)
#reduces words to their stems
stemmer = PorterStemmer()
words = [stemmer.stem(word) for word in words]
return words
Developer: aidad, Project: MachineLearning, Lines of code: 34, Source file: spam_classifier.py
Example 2: stemming
def stemming(line_list):
"""
Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data
Iterates over all terms in the lines and stems them
Return: stemmed_list (list of strings of stemmed terms)
"""
stemmed_list = []
stemmer = PorterStemmer()
for i, line in enumerate(line_list):
# lowercase
line = line.lower()
# remove punctuation
# the method below simply removes punctuation, but introduces mistakes such as amazon.com => amazoncom
# nopunct_line = ''.join([c for c in line
# if re.match("[a-z\-\' \n\t]", c)])
# this solves the problem above:
nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
# tokenize
line_token = wt(nopunct_line)
# list to store stemmed terms
stemmed_line = []
for term in line_token:
term = stemmer.stem_word(term)
stemmed_line.append(term)
# back to sentence as a string
stemmed_sentence = ' '.join(stemmed_line)
stemmed_list.append(stemmed_sentence)
return stemmed_list
Developer: YuanhaoSun, Project: PPLearn, Lines of code: 30, Source file: ml_feature_engineering.py
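Note: this example (and several others below) calls stemmer.stem_word(...), a method from older NLTK releases that is no longer available in recent NLTK 3.x versions. stem() exists across versions, which makes it the safer call when porting these snippets; a minimal sketch:
from nltk import PorterStemmer

stemmer = PorterStemmer()
# stem() is available across NLTK versions; stem_word() was dropped in NLTK 3.x
print(stemmer.stem('amazing'))  # -> 'amaz'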
Example 3: make_tags
def make_tags(title_string):
stemmer = PorterStemmer()
ret = []
for word in title_string.split():
if word not in stop_words:
ret.append(stemmer.stem_word(word.lower()))
return ret
Developer: abhijat, Project: RedditSearch, Lines of code: 7, Source file: tagger.py
Example 4: _log_likelihood
def _log_likelihood(answer_text, stemmed_vocabulary, distrib_matrix):
LL = 0
if answer_text != '':
tokens = word_tokenize(str(answer_text), language='english')
porter_stemmer = PorterStemmer()
unique_wordcount = len(stemmed_vocabulary)
"""
for each unique w in words
Cw = count of occurrences of w in answer_text
PwM = self.distrib_matrix[stemmer(w)]
unique_wordcount = len(tokenize(answer_text))
"""
for w in tokens:
_w = w.strip().lower()
Cw = 0
for _ in answer_text.split():
if _w == _.strip().lower():
Cw += 1
try:
w_stem = porter_stemmer.stem(_w.decode('utf-8', 'replace').encode('ascii', 'replace'))
except AttributeError:
w_stem = porter_stemmer.stem(_w)
try:
PwM = distrib_matrix[w_stem]
except KeyError: # key error means frequency is equal to cutoff point 1
PwM = 1
LL += (Cw * log(float(PwM)))
try:
LL = "{0:.2f}".format(LL / float(unique_wordcount))
except ZeroDivisionError:
LL = 0
return LL
Developer: piercolella, Project: qa-scrapers, Lines of code: 35, Source file: discretizer.py
Example 5: openAndProcessingFiles
def openAndProcessingFiles(path,resultDict): # Main Function
for filename in os.listdir(os.getcwd()+path):
thisFile = open(os.getcwd()+path+'/'+filename,'r') #open the file and process each file
currentTextString = " ".join(thisFile.read().split())#store the file as a string for removing HTML tags
textAfterHtmlRemovingString = re.sub('<[^>]*>', '', currentTextString) # remove HTML tags (String)
textAfterHtmlRemovingList = textAfterHtmlRemovingString.split() # convert the string into a list of words
textRemoveingUnnecessaryCharactersList = [removeUnnecessaryCharacters(word) for word in textAfterHtmlRemovingList ]
textRemoveingUnnecessaryCharactersList = [word for word in textRemoveingUnnecessaryCharactersList if word is not None]
stop_words = set(stopwords.words('english'))
stop_words.update(['texthtml', 'html', 'server', "email", 'date', 'gmt', 'www']) # By analyzing the previous result set, continually adding new stopwords
textAfterStopwordsRemovingList = [word for word in textRemoveingUnnecessaryCharactersList if word not in stop_words] #remove stopwords
stemmer = PorterStemmer() #stemming
for eachWord in textAfterStopwordsRemovingList:
eachWord = stemmer.stem(eachWord)
storeToResultDict(eachWord,resultDict)
thisFile.close()
Developer: abryu, Project: WebMining-Python, Lines of code: 29, Source file: v1.py
Example 6: review_to_words
def review_to_words(raw_review, remove_stopwords = False):
# BeautifulSoup pulls data out of html file
# here it removes html tags and markups
text = BeautifulSoup(raw_review).get_text()
# replace numbers by word number
text=re.sub(r'[0-9]+','number',text)
# remove punctuations (they can be analyzed for better results)
text = re.sub(r'[^a-zA-Z]', ' ', text)
text = text.lower()
#make a list of words
words_list = text.split()
#download nltk text data sets, including stop words
#nltk.download()
if remove_stopwords:
# get stopwords, searching a set is faster than searching a list
stops = set(stopwords.words('english'))
# remove stopwords
words_list = [word for word in words_list if not word in stops]
# reduce words to their stems
stemmer=PorterStemmer()
words_list=[stemmer.stem(word) for word in words_list]
# return the list of words
return words_list
Developer: aidad, Project: MachineLearning, Lines of code: 29, Source file: ReviewParsing.py
Example 7: main
def main():
with open("sentiment.txt", 'r') as _file:
stemmer = PorterStemmer()
features = []
for words in _file:
feature = []
is_sentence = True
# skip the polarity label
for word in words.split()[1:]:
try:
word = word.decode("utf-8")
if word not in [".", ",", ":", "?", "!"] \
and not has_stop_list(word):
feature.append(stemmer.stem(word))
except UnicodeDecodeError:
# ignore lines with garbled characters
is_sentence = False
break
if is_sentence:
features.append(feature)
return features
Developer: N4CL, Project: NLP100, Lines of code: 26, Source file: ch8_72.py
Example 8: stemm
def stemm(cls, tokens):
stemmer = PorterStemmer()
for i, t in enumerate(tokens):
tokens[i] = stemmer.stem(t)
return tokens
Developer: devjyotip, Project: twitter-analytics-dashboard, Lines of code: 7, Source file: textprocessing.py
Example 9: normalize
def normalize(word):
'''
normalize the word for querying or indexing
:param word: unicode string
:return: unicode string of the normalized term
'''
porter = PorterStemmer()
return porter.stem(word) if word[0].isalpha() else ''
Developer: genehwung, Project: gutenberg_indexer, Lines of code: 8, Source file: utilities.py
Example 10: get_ngram_features
def get_ngram_features(self):
stemmer = PorterStemmer()
top_features = [(stemmer.stem(token) + "__TOP__", True) for token in self.top_text]
bottom_features = [(stemmer.stem(token) + "__BOTTOM__", True) for token in self.bottom_text]
all_features = [(stemmer.stem(token) + "__ALL__", True) for token in self.all_text]
self.ngram_features = dict(top_features + bottom_features + all_features)
Developer: jayhack, Project: automeme, Lines of code: 8, Source file: Meme.py
Example 11: __process_email
def __process_email(self, email_contents, vocab):
'''
Preprocess the body of an email and return a
list of word_indices.
Arguments:
email_contents (str): Email body.
vocab (dict): Vocabulary dictionary mapping words to indices.
Return:
(int list): Indices in vocab of the words found in the processed email body.
'''
# Lower case.
email_contents = email_contents.lower()
# Strip all HTML
# Looks for any expression that starts with < and ends with > and replace
# and does not have any < or > in the tag it with a space
email_contents = re.sub('<[^<>]+>', ' ', email_contents)
# Handle Numbers
# Look for one or more characters between 0-9
email_contents = re.sub('[0-9]+', 'number', email_contents)
# Handle URLS
# Look for strings starting with http:// or https://
email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)
# Handle Email Addresses
# Look for strings with @ in the middle
email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)
# Handle $ sign
email_contents = re.sub('[$]+', 'dollar', email_contents)
# Tokenize and also get rid of any punctuation
word_list = re.split(' |@|$|/|#|\.|-|:|&|\*|\+|=|[|]|\?|!|(|)|{|}|,|''|"|>|_|<|;|%',
email_contents)
# Remove empty string and skip the word if it is too short.
word_list = [s for s in word_list if s and len(s) > 1]
# Remove any non alphanumeric characters
word_list = [re.sub('[^a-zA-Z0-9]', '', s) for s in word_list]
# Remove empty string and skip the word if it is too short.
word_list = [s for s in word_list if s and len(s) > 1]
# Stem the word
ps = PorterStemmer()
word_list = [ps.stem_word(s) for s in word_list]
word_indices = []
# Find index in vocab list.
for w in word_list:
if w in vocab:
word_indices.append(vocab[w])
return word_indices
Developer: farjan, Project: MachineLearning, Lines of code: 58, Source file: ex6.py
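The word indices returned by __process_email are typically expanded into a fixed-length feature vector before being fed to a classifier. A hypothetical follow-up sketch (the function name, the vocabulary-size parameter, and the assumption of 0-based indices are not part of the original project):
import numpy as np

def email_features(word_indices, vocab_size):
    # binary indicator vector: entry i is 1 if vocabulary word i occurs in the email
    features = np.zeros(vocab_size)
    for idx in word_indices:
        features[idx] = 1
    return features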
Example 12: processContent
def processContent(self, content):
stemmer = PorterStemmer()
tokens = word_tokenize(content)
tokens = filter(lambda x: len(x) < 20 and x.isalnum(), tokens)
tokens = [stemmer.stem(token.lower()) for token in tokens]
tokens = filter(lambda x: x not in stopwords.words('english'), tokens)
tokens = [str(token) for token in tokens]
bow = FreqDist(tokens)
return(bow)
Developer: danmerl, Project: jobbot, Lines of code: 9, Source file: job_spider.py
Example 13: getStemmedWords
def getStemmedWords(self,html):
stemmed_words=[]
#stemmer = SnowballStemmer("english")
stemmer = PorterStemmer()
for token in html:
stemmed_words.append(stemmer.stem_word(token))
return ' '.join(stemmed_words)
Developer: usc-isi-i2, Project: dig-classifier, Lines of code: 9, Source file: preprocessor.py
Example 14: main
def main():
# Use file defined by BIOC_IN as default if no other provided
bioc_in = BIOC_IN
if len(sys.argv) >= 2:
bioc_in = sys.argv[1]
# A BioCReader object is put in place to hold the example BioC XML
# document
bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)
# A BioCWriter object is prepared to write out the annotated data
bioc_writer = BioCWriter(BIOC_OUT)
# The NLTK porter stemmer is used for stemming
stemmer = PorterStemmer()
# The example input file given above (by BIOC_IN) is fed into
# a BioCReader object; validation is done by the BioC DTD
bioc_reader.read()
# Pass over basic data
bioc_writer.collection = bioc_reader.collection
# Get documents to manipulate
documents = bioc_writer.collection.documents
# Go through each document
annotation_id = 0
for document in documents:
# Go through each passage of the document
for passage in document:
# Stem all the tokens found
stems = [stemmer.stem(token) for
token in wordpunct_tokenize(passage.text)]
# Add an annotation showing the stemmed version, in the
# given order
for stem in stems:
annotation_id += 1
# For each token an annotation is created, providing
# the surface form of a 'stemmed token'.
# (The annotations are collectively added following
# a document passage with a <text> tag.)
bioc_annotation = BioCAnnotation()
bioc_annotation.text = stem
bioc_annotation.id = str(annotation_id)
bioc_annotation.put_infon('surface form',
'stemmed token')
passage.add_annotation(bioc_annotation)
# Print file to screen w/o trailing newline
# (Can be redirected into a file, e.g. output_bioc.xml)
sys.stdout.write(str(bioc_writer))
# Write to disk
bioc_writer.write()
Developer: 2mh, Project: PyBioC, Lines of code: 57, Source file: stemmer.py
Example 15: stemmingword
def stemmingword(word_list, stemtype='porter'):
if stemtype == 'porter':
stemengine = PorterStemmer()
else:
stemengine = LancasterStemmer()
try:
filtered_words = [stemengine.stem(token).encode('latin-1', errors='ignore') for token in word_list]
except UnicodeDecodeError, e:
print 'Character encoding error, discarding text "{}"'.format(' '.join(word_list))
Developer: ARGHZ, Project: ClassifTweets, Lines of code: 9, Source file: execute_xperiment.py
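For context on the porter/lancaster choice above: the two stemmers can produce noticeably different stems, with Lancaster generally being the more aggressive of the two. A small comparison sketch (the sample words are illustrative only):
from nltk import PorterStemmer, LancasterStemmer

porter = PorterStemmer()
lancaster = LancasterStemmer()
for token in ['running', 'maximum', 'presumably']:
    print(token, porter.stem(token), lancaster.stem(token))
# e.g. 'maximum' stays 'maximum' under Porter but becomes 'maxim' under Lancaster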
Example 16: getPosWords
def getPosWords():
stemmer = PorterStemmer()
stemmedPosTokens = []
pos = open(r'pos.txt').read()
pos = re.sub("\d", "", pos)
posWords = nltk.word_tokenize(pos)
for posWord in posWords:
stemmedPosWord = stemmer.stem(posWord)
stemmedPosTokens.append(stemmedPosWord.lower())
return stemmedPosTokens
Developer: ffmaer2, Project: newstrader, Lines of code: 10, Source file: Sentiment.py
Example 17: preprocess
def preprocess( result ):
words = removePunct(result.title)
words += " "
words += removePunct(result.snippet)
result.tokens = nltk.word_tokenize(words)
for tok in result.tokens:
if tok not in STOPS:
tok = PorterStemmer().stem(tok.decode('utf-8'))
tok = tok.lower().encode('utf-8')
return result
Developer: jaredhancock31, Project: GoogleRelevanceRanker, Lines of code: 10, Source file: ranker.py
Example 18: update_Porter_stemming
def update_Porter_stemming(): #We use stems occasionally.
"Updating stems from Porter algorithm..."
from nltk import PorterStemmer
stemmer = PorterStemmer()
cursor.execute("""SELECT word FROM words WHERE wordid <= 750000 and stem is null;""")
words = cursor.fetchall()
for local in words:
word = ''.join(local)
if re.match("^[A-Za-z]+$",word):
query = """UPDATE words SET stem='""" + stemmer.stem(''.join(local)) + """' WHERE word='""" + ''.join(local) + """';"""
z = cursor.execute(query)
Developer: nchernia, Project: Presidio, Lines of code: 11, Source file: Pyfunctions.py
Example 19: getNegWords
def getNegWords():
stemmer = PorterStemmer()
stemmedNegTokens = []
neg = open(r'neg.txt').read()
neg = re.sub("\d", "", neg)
negWords = nltk.word_tokenize(neg)
for negWord in negWords:
stemmedNegWord = stemmer.stem(negWord)
stemmedNegTokens.append(stemmedNegWord.lower())
return stemmedNegTokens
Developer: ffmaer2, Project: newstrader, Lines of code: 11, Source file: Sentiment.py
Example 20: getUncertainWords
def getUncertainWords():
stemmer = PorterStemmer()
stemmedUnTokens = []
un = open(r'uncertain.txt').read()
un = re.sub("\d", "", un)
unWords = nltk.word_tokenize(un)
for unWord in unWords:
stemmedUnWord = stemmer.stem(unWord)
stemmedUnTokens.append(stemmedUnWord.lower())
return stemmedUnTokens
Developer: ffmaer2, Project: newstrader, Lines of code: 11, Source file: Sentiment.py
Note: The nltk.PorterStemmer class examples in this article were compiled by 纯净天空 from source code and documentation hosted on platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors; please follow each project's license when distributing or using the code. Do not republish this article without permission.