This article collects typical usage examples of the Python class nltk.stem.SnowballStemmer. If you have been wondering what the SnowballStemmer class is for, how to use it, or where to find working examples, the curated class examples below should help.
The article presents 20 code examples of the SnowballStemmer class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
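Before the collected examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the API that every example relies on: construct SnowballStemmer with a language name and call stem() on each token. The outputs in the comments are what NLTK's Snowball stemmer is expected to return.

from nltk.stem import SnowballStemmer

# Pick a language supported by the Snowball algorithm.
stemmer = SnowballStemmer("english")

print(stemmer.stem("running"))     # expected: "run"
print(stemmer.stem("generously"))  # expected: "generous"

# The supported languages are listed on the class itself.
print(SnowballStemmer.languages)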
Example 1: des_extrect
def des_extrect():
    filename_list = []
    file_stopwords = file('stopwords.txt', "r")
    stopwords = [line.strip() for line in file_stopwords.readlines()]
    for file_name in os.listdir(DESCRIPTION_DIR):
        filename_list.append(file_name)
    for filename in filename_list:
        path = os.path.join(DESCRIPTION_DIR, filename)
        fr = file(path, 'r')
        fw = file(filename + '.des', 'w')
        soup = BeautifulSoup(fr.read())
        docs = soup.findAll('doc')
        for doc in docs:
            content = str(doc['title'] + doc.snippet.text)
            content = re.sub("[\.\@\,\:\;\!\?\(\)]".decode("utf8"), "".decode("utf8"), content)
            stemmer = SnowballStemmer('english')
            content = content.split()
            pro_content = ''
            for w in content:
                w = stemmer.stem(w)
                # remove stopwords
                if w not in stopwords:
                    pro_content += w + ' '
            fw.write(doc['rank'] + ' ' + pro_content + '\n')
        fw.close()
        fr.close()
Author: delili, Project: WePS-2-Clustering, Lines: 26, Source: procress.py
Example 2: text_token_data_generator
def text_token_data_generator():
    global id_text_index_map
    translation_table = string.maketrans(
        string.punctuation + string.uppercase, " " * len(string.punctuation) + string.lowercase
    )
    snowball_stemmer = SnowballStemmer("english")
    for f in glob.glob("json/text/*.json"):
        for line in open(f).readlines():
            extract_row = json.loads(line)
            id_text_index_map[extract_row["file_id"]] = len(id_text_index_map)
            visible_text = extract_row["visible_text"].encode("ascii", "ignore")
            visible_text = visible_text.translate(translation_table)
            visible_text = [
                snowball_stemmer.stem(word)
                for word in visible_text.split()
                if word not in ENGLISH_STOP_WORDS and len(word) > 1
            ]
            title = extract_row["title"].encode("ascii", "ignore")
            title = title.translate(translation_table)
            title = [
                "t^{}".format(snowball_stemmer.stem(word))
                for word in title.split()
                if word not in ENGLISH_STOP_WORDS and len(word) > 1
            ]
            visible_text.extend(title)
            yield " ".join(visible_text)
Author: daxiongshu, Project: Dato-Sponsored-Page-Prediction, Lines: 26, Source: js2sp_converter.py
Example 3: text_to_wordlist
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]

    text = " ".join(text)

    # Remove special characters
    text = special_character_removal.sub('', text)

    # Replace numbers
    text = replace_numbers.sub('n', text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return a list of words
    return (text)
Author: hitboys, Project: Toxic-Comment-Classification-Challenge, Lines: 28, Source: simple_lstm.py
Example 4: ModelBuilder
class ModelBuilder():
    def __init__(self):
        self.model = {}
        self.stemmer = SnowballStemmer('english')

    def build(self):
        with open('data/candidate_synonyms.txt') as f:
            all_words = f.read().split('\n')
            for words in all_words:
                if words:
                    word, similar = words.split(',')
                    word, similar = self.stemmer.stem(word), self.stemmer.stem(similar)
                    if word not in self.model: self.model[word] = {}
                    self.model[word][similar] = 1
        return self

    def condense(self):
        condensed_model = {}
        for word, similars in self.model.items():
            for similar in similars:
                if self.model.get(similar, {}).has_key(word):
                    if condensed_model.has_key(word):
                        condensed_model[word].append(similar)
                    else:
                        condensed_model[word] = [similar]
        self.model = condensed_model
        return self
Author: jayeshsidhwani, Project: simset_model, Lines: 28, Source: model_builder.py
Example 5: frequency_analysis
def frequency_analysis(input_path, output_path, stopwords=None, n_most_common=50):
    recipes = []
    with open(input_path, 'r') as f:
        for i, line in enumerate(f):
            if line == '\n':
                break
            if i == 0:
                continue  # skip header
            fields = line.split('\t')
            recipes.append(fields[1].replace("\n", ""))

    recipe_text = re.sub("[^a-z ]", "", ' '.join(recipes))
    recipe_words = re.split("\s+", recipe_text)
    stemmer = SnowballStemmer("english")
    recipe_stems = [stemmer.stem(w) for w in recipe_words]
    if stopwords is not None:
        recipe_stems = filter(None, [s for s in recipe_stems if s not in stopwords])
    top_words = Counter(recipe_stems).most_common(n_most_common)

    # write to a file;
    # do a second pass over the recipes to determine how many documents each term appears in
    freq_table = open(output_path, 'wb')
    for elt in top_words:
        doc_freq = sum([elt[0] in recipe for recipe in recipes])
        freq_table.write(','.join([str(e) for e in elt]) + ',' + str(doc_freq) + '\n')
    freq_table.close()
Author: robert-giaquinto, Project: sentence_boundary_detection, Lines: 25, Source: frequency_analysis.py
Example 6: norm_corpus
def norm_corpus(document_list):
    norm_doc_list = []

    # lowercase
    document_list = [word.lower() for word in document_list]

    # remove symbols in text
    symbols = ",.?!"
    for sym in symbols:
        document_list = [word.replace(sym, '') for word in document_list]

    # loop through each string, i.e. each review in the column
    for doc in document_list:
        doc = nltk.word_tokenize(doc)
        # remove stopwords
        doc = [word for word in doc if word not in stopwords.words('english')]
        # stem words
        stemmer = SnowballStemmer("english")
        doc = [stemmer.stem(word) for word in doc]
        # make the tokenised text one string
        norm_doc = " ".join(doc)
        norm_doc_list.append(norm_doc)

    return norm_doc_list
Author: mariaathena, Project: yelp_data_challenge, Lines: 29, Source: old_parse_tip_data.py
Example 7: stemmed
def stemmed(text, language):
    stemmer = SnowballStemmer(language)
    tas = text.split()
    text = ""
    for word in tas:
        text = " ".join((text, stemmer.stem(word)))
    return text.lstrip()
Author: bobvdvelde, Project: inca, Lines: 7, Source: analysis.py
Example 8: procesar
def procesar(request, identificador):
    lmtzr = WordNetLemmatizer()
    d = Documento.objects.get(id=identificador)
    #nltk.corpus.cess_esp.words()
    tokens = nltk.word_tokenize(d.contenido.replace('.', ' . '))
    #print tokens
    #scentence = d.contenido
    #scentence = scentence.lower()
    words = tokens
    spanish_stemmer = SnowballStemmer('spanish')
    # This is the simple way to remove stop words
    important_words = []
    for word in words:
        if word not in stopwords.words('spanish'):
            important_words.append([word, lmtzr.lemmatize(word), spanish_stemmer.stem(word)])
    return render_to_response('templates/documentoProcesado.html',
                              {
                                  'original': d.contenido,
                                  'tokens': tokens,
                                  'important_words': important_words,
                                  #'pos_tags': pos_tags,
                                  #'ne_chunks': ne_chunks.subtrees(),
                              })
Author: alexanderalfaro, Project: pqr, Lines: 34, Source: views.py
Example 9: normalized_token
def normalized_token(token):
    """
    Use the stemmer to normalize the token.
    Call this function when building the graph, instead of changing how word forms are stored in file_text.
    """
    stemmer = SnowballStemmer("english")
    return stemmer.stem(token.lower())
Author: carlsplace, Project: KeyphraseExtraction, Lines: 7, Source: ugly.py
Example 10: preprocessing
def preprocessing(doc):  # stopword removal is optional
    x = re.sub("[^a-zA-Z]", " ", doc)  # keep only letters
    x = x.lower().split()
    stemmer = SnowballStemmer("english")  # use Snowball
    stops = set(stopwords.words("english"))  # a set is faster than a list
    x = [stemmer.stem(word) for word in x if word not in stops]
    return (x)
Author: Kiminaka, Project: topic_model_intrusion_eval, Lines: 7, Source: evaluation_function.py
Example 11: stemWordMatch2
def stemWordMatch2(question, sentence):
    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens = set(nltk.word_tokenize(sentence))

    # Finding the match between two words from the same root using the Lancaster stemmer
    '''stemmer=LancasterStemmer()
    for i in sentence_tokens:
        stem_words_list.append(stemmer.stem(i))
    for i in question_tokens:
        question_words_list.append(stemmer.stem(i))
    #print 'Stem word list',stem_words_list
    #print 'Question word list', question_words_list
    stem_count=0
    for i in stem_words_list:
        #Finding the exact word match
        if i.lower() in [x.lower() for x in question_words_list]:
            #print 'Question word is',x
            #print 'Sentence word stem is :',i
            #print 'Match'
            stem_count=stem_count+6
            stem_word_match_counter.append(count)'''

    stem_word_match_counter = []
    stem_words_list = []
    question_words_list = []

    # Finding the match between two words from the same root using the Snowball stemmer
    snowball_stemmer = SnowballStemmer('english')
    for i in sentence_tokens:
        stem_words_list.append(snowball_stemmer.stem(i))
    for i in question_tokens:
        question_words_list.append(snowball_stemmer.stem(i))
    #print 'Stem word list',stem_words_list
    #print 'Question word list', question_words_list

    stem_count = 0
    for i in stem_words_list:
        # Finding the exact word match
        if i.lower() in [x.lower() for x in question_words_list]:
            #print 'Question word is',x
            #print 'Sentence word stem is :',i
            #print 'Match'
            stem_count = stem_count + 6
    #print 'Stem word count match score is :', stem_count
    return stem_count
Author: AnirudhNarasimhamurthy, Project: Natural-Language-Processing-Fall-2015, Lines: 57, Source: WM.py
Example 12: preprocess_tweets
def preprocess_tweets(tweets):
    stemmer = SnowballStemmer("english")
    stop = set(stopwords.words("english"))
    tweet_texts = [" ".join(stemmer.stem(i) if len(i) > 1 else i
                            for i in ("".join(c for c in word if c not in string.punctuation)
                                      for word in tweet["text"].lower().split())
                            if i and i not in stop)
                   for tweet in tweets]
    return list(set(tweet_texts))
Author: jiwu14, Project: TweetAnalyzer, Lines: 9, Source: TweetAnalyzer.py
Example 13: stem
def stem(self, content):
    import re
    original_string = content
    new_content = re.sub('[^a-zA-Z0-9\n\.]', ' ', original_string)
    words = new_content.split()
    stemmer = SnowballStemmer('english')
    singles = [stemmer.stem(wordsa) for wordsa in words]
    return (' '.join(singles))
Author: HarshSharma12, Project: fun-scripts, Lines: 9, Source: summary_tool_stemmed.py
Example 14: __call__
def __call__(self, doc):
    snowball_stemmer = SnowballStemmer('english')
    #tokenizer = RegexpTokenizer(r'\w+')
    #words=[self.wnl.lemmatize(t) for t in word_tokenize(doc)]
    words = [snowball_stemmer.stem(t) for t in word_tokenize(doc)]
    stop_words = set(stopwords.words('english'))
    stop_words.update(self.mystops)
    stop_words = list(stop_words)
    return [i.lower() for i in words if i not in stop_words]
Author: joswinkj, Project: question_answering, Lines: 9, Source: Tokenizers.py
Example 15: stemLem
def stemLem(w):
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer("english")
    #stemmer = PorterStemmer()
    lem = lemmatizer.lemmatize(w)
    if len(w) > len(lem):
        return lem
    return stemmer.stem(w)
Author: NSindre, Project: master-general, Lines: 9, Source: lemmatizeGeneralTerm.py
Example 16: stemmed_top_user_words
def stemmed_top_user_words(usertxt, num=10):
    wl_usertxt = word_tokenize(usertxt.lower())
    num = min(num, len(wl_usertxt))
    snowball_stemmer = SnowballStemmer("english")
    stemmed_fl_usertxt = [snowball_stemmer.stem(w) for w in wl_usertxt if (len(w) > 4 and w not in ewl)]
    fd_user_ls = [w[0] for w in FreqDist(Text(stemmed_fl_usertxt)).most_common(num)]
    return fd_user_ls
Author: Reinaesaya, Project: munchee, Lines: 9, Source: text_mine.py
Example 17: main
def main(input_file, dbname):
    """
    Main function. Connects to a database and reads a CSV file
    with the arousal and valence values. Uses the sentiment
    library to compute the sentiment of each document.
    :param input_file: the ANEW file
    :param dbname: the name of the database
    """
    # read ANEW file
    if not os.path.exists(input_file):
        logging.error('File %s does not exist', input_file)
        sys.exit(1)
    else:
        csvfile = open(input_file, 'r')
        reader = csv.reader(csvfile, delimiter=',')
        reader.next()  # skip headers
        stemmer = SnowballStemmer('spanish')
        anew = dict([(stemmer.stem(unicode(row[2], 'utf-8')),
                      {'valence': float(row[3]),
                       'arousal': float(row[5])}) for row in reader])

    couch = couchdb.Server()
    database = couch[dbname]
    logging.info('Established connection with the db %s', dbname)

    for element in database:
        doc = database.get(element)
        comments = " ".join([comment['cleaned_summary']
                             for comment in doc['comments']])
        description = " ".join([database.get(element)['title'],
                                doc['description']])
        sentiment_comments = get_sentiment(anew, comments)
        sentiment_description = get_sentiment(anew, description)
        if sentiment_comments is not None and sentiment_description is not None:
            logging.info('%s val: %.2f - %.2f aro: %.2f - %.2f : %s',
                         doc.id, sentiment_comments[0],
                         sentiment_description[0],
                         sentiment_comments[1],
                         sentiment_description[1],
                         doc['title'])
            doc['sentiments'] = {'comments':
                                 {'valence': sentiment_comments[0],
                                  'arousal': sentiment_comments[1]},
                                 'description':
                                 {'valence': sentiment_description[0],
                                  'arousal': sentiment_description[1]}}
            database.save(doc)
        else:
            logging.warn('%s could not be analyzed. skipping ...',
                         database.get(element)['title'])
Author: albertfdp, Project: dtu-data-mining, Lines: 57, Source: sentiment.py
Example 18: stem_text
def stem_text(self):
    '''
    Perform stemming
    '''
    stemmer = SnowballStemmer("english")
    stemmed_sents = []
    for sent in self.tok_text:
        stemmed_sents.append([stemmer.stem(tok) for tok in sent])
    self.stem_text = stemmed_sents
Author: emgrol, Project: false_review_detection, Lines: 10, Source: preprocess.py
Example 19: prepare_request
def prepare_request(request, synonyms=False):
    #request = translate(request)
    request = re.sub(r"(\n)", " ", request.lower())
    request = re.sub(r"(-\n)", "", request)
    request = re.split("[^a-z0-9]", request)
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')
    if synonyms == True:
        request = add_synonyms([word for word in request if word not in stop_words])
    request = [stemmer.stem(word) for word in request if (word not in stop_words) & (len(word) > 1) & (len(word) < 20)]
    return ' '.join(request)
Author: Xsardas1000, Project: Search, Lines: 11, Source: vec_search.py
Example 20: tokenize
def tokenize(resultList1):
    entrada = []
    tokens = word_tokenize(resultList1)
    filtered_words = [w for w in tokens if not w in stopwords.words('spanish')]
    stemmer = SnowballStemmer('spanish')
    for i in filtered_words:
        stri = unicode(i, errors='replace')
        entrada.append(stemmer.stem(stri))
    return entrada
Author: josearcosaneas, Project: RepositorioPara-la-entrega-del-TFG, Lines: 11, Source: ClasificadorResumen.py
Note: the nltk.stem.SnowballStemmer class examples in this article were compiled by 纯净天空 from source code and documentation hosted on GitHub, MSDocs, and similar platforms. The snippets come from open-source projects contributed by their respective developers; copyright remains with the original authors. For redistribution and use, please follow the license of the corresponding project; do not republish without permission.