This article collects typical usage examples of the snowballstemmer.stemmer function in Python. If you have been wondering what exactly the stemmer function does and how to use it, the hand-picked code examples below may help.
Twenty code examples of the stemmer function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
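Before diving into the examples, here is a minimal sketch of the core API that all of them rely on: snowballstemmer.stemmer() builds a stemmer object for a named algorithm, stemWord() stems a single token, and stemWords() stems a list of tokens. The sample words and the results in the comments are illustrative.

    import snowballstemmer

    # Build a stemmer for a named Snowball algorithm ('english', 'spanish', 'turkish', ...).
    stemmer = snowballstemmer.stemmer('english')

    # Stem a single token.
    print(stemmer.stemWord('running'))  # 'run'

    # Stem a list of tokens in one call.
    print(stemmer.stemWords(['cats', 'stemming', 'examples']))  # ['cat', 'stem', 'exampl']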
Example 1: clean
import re
import string

import snowballstemmer

# five_stemmer, simple_stem and STOP_WORDS are defined elsewhere in corpus_stats.py

def clean(text, stemmer='snowball'):
    """Normalize, split, and clean text

    Parameters:
    -----------
    text : str
        Block of text to clean and prepare.
    stemmer : str, opt
        Stemmer to use: [snowball, five, simple, none]

    Returns:
    --------
    text : list
        Cleaned and prepared list of tokens.
    """
    if stemmer not in ['snowball', 'five', 'simple', 'none']:
        raise ValueError("Stemmer choice not available.")

    text = re.sub("[{}]".format(string.punctuation), " ", text.lower())
    text = text.split()

    if stemmer == 'five':
        text = [five_stemmer(item) for item in text]
    elif stemmer == 'snowball':
        stemmer = snowballstemmer.stemmer('english')
        text = stemmer.stemWords(text)
    elif stemmer == 'simple':
        text = [simple_stem(item) for item in text]

    text = [item for item in text if item not in STOP_WORDS]

    return text
Author: justincely, Project: classwork, Lines of code: 35, Source file: corpus_stats.py
Example 2: preprocess_document
import string

import nltk
from nltk.corpus import stopwords
import snowballstemmer

def preprocess_document(data):
    # Step 1: strip punctuation
    data = data.lower()
    punctuation = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']',
                   '{', '}', '#', '\\', '/', '@', '\xa0', '\n', '&', '$', '‘', '…', '•', '-']
    for punc in punctuation:
        data = data.replace(punc, '')
    # Step 2: tokenize
    data = list(nltk.word_tokenize(data))
    # Step 3: strip stopwords
    stop = set(stopwords.words('english'))
    extra_stopwords = ['ok', 'oh', 'via', 'bc', 'gon', 'na']  # add any additional stopwords we want to use here
    stop.update(extra_stopwords)
    stop.update(list(string.ascii_lowercase))  # remove all single letters
    data = [i for i in data if i not in stop]  # remove stopwords
    # Step 4: stemming
    stemmer = snowballstemmer.stemmer('english')
    data = stemmer.stemWords(data)
    # Step 5: remove words not in the NLTK english corpus
    # (a comprehension avoids the original bug of removing items
    # from a list while iterating over it)
    words = set(nltk.corpus.words.words())
    data = [w for w in data if w in words]
    return data
Author: ZanW, Project: Python, Lines of code: 27, Source file: News_pre_l.py
Example 3: stemming
import codecs

import snowballstemmer

# lower_ is a Python 2/3 compatible lowercasing helper defined elsewhere in stemwords.py

def stemming(lang, input, output, encoding, pretty):
    result = []
    stemmer = snowballstemmer.stemmer(lang)
    for original in codecs.open(input, "r", encoding).readlines():
        original = original.strip()
        # Convert only ASCII-letters to lowercase, to match C behavior
        original = ''.join((lower_(c) if 'A' <= c <= 'Z' else c for c in original))
        stemmed = stemmer.stemWord(original)
        if result:
            result.append('\n')
        if pretty == 0:
            if stemmed != "":
                result.append(stemmed)
        elif pretty == 1:
            result.append(original + " -> " + stemmed)
        elif pretty == 2:
            result.append(original)
            if len(original) < 30:
                result.append(" " * (30 - len(original)))
            else:
                result.append("\n")
                result.append(" " * 30)
            result.append(stemmed)
    outfile = codecs.open(output, "w", encoding)
    outfile.write(''.join(result) + '\n')
    outfile.close()
Author: xjzhou, Project: snowball, Lines of code: 26, Source file: stemwords.py
Example 4: getHighlightingsVariables
def getHighlightingsVariables(self, article, variable_keywords, variable_pages):
    stemmer = snowballstemmer.stemmer("german")
    #goodchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÖÜäöüß'"
    for i in range(0, len(article)):
        for j in range(0, len(article[i])):
            article[i][j] = article[i][j].split(" ")
            for k in range(0, len(article[i][j])):
                #article[i][j][k] = chrtran(article[i][j][k], goodchars, "")
                article[i][j][k] = stemmer.stemWord(article[i][j][k])
    for i in range(0, len(variable_keywords)):
        #variable_keywords[i] = chrtran(variable_keywords[i], goodchars, "")
        variable_keywords[i] = stemmer.stemWord(variable_keywords[i])
    highlight = []
    for i in range(0, len(article)):
        highlight_article = []
        for j in range(0, len(article[i])):
            highlight_variables = []
            for k in range(0, len(variable_keywords)):
                highlight_variables.append(random.random())
            highlight_article.append(highlight_variables)
        highlight.append(highlight_article)
    return highlight
Author: Institute-Web-Science-and-Technologies, Project: westcat, Lines of code: 31, Source file: Highlighter_Articles.py
Example 5: turkish
import snowballstemmer
from nltk import word_tokenize
from nltk.corpus import stopwords

def turkish(sent):
    # No turkish stemmer in NLTK, so use snowballstemmer directly
    stem = snowballstemmer.stemmer('turkish')
    stop = stopwords.words('turkish')
    tx = word_tokenize(sent)
    mx = stem.stemWords(tx)
    px = [x for x in mx if x not in stop]
    return px
Author: Jiannan28, Project: stemtokstop, Lines of code: 8, Source file: stemtokstop.py
Example 6: __init__
def __init__(self, language=None):
    """Create a new highlighter for the specified language.
    """
    if language:
        self.stem = snowballstemmer.stemmer(language)
    else:
        self.stem = NoStem()
Author: flaxsearch, Project: highlighter, Lines of code: 8, Source file: highlight.py
Example 7: aplicarStemmer
import snowballstemmer

def aplicarStemmer(pDictPalabrasArchivos):
    print("aplicando stemming...")
    dictRaices = {}
    stemmer = snowballstemmer.stemmer("spanish")
    for docId, palabras in pDictPalabrasArchivos.items():
        raices = stemmer.stemWords(palabras)
        dictRaices[docId] = raices
    ## archivo.archivo.crearCSVDict(".\stemming.csv", dictRaices)
    return dictRaices
Author: 201265615, Project: TP2_RIT_II15_PY, Lines of code: 9, Source file: archivo_invertido.py
Example 8: __init__
def __init__(self, xml):
    self.dest = xml.get("dest")
    if self.dest is None:
        raise ValueError()
    self.verbose = xml.get("verbose")
    if self.verbose is None:
        self.verbose = False
    else:
        self.verbose = True
    self.stemmer = snowballstemmer.stemmer('english')
Author: Sentimentron, Project: Nebraska-public, Lines of code: 10, Source file: stemmer.py
Example 9: aplicarStemmerConsulta
import snowballstemmer

def aplicarStemmerConsulta(pLista):
    #print(pLista)
    print("aplicando stemming...")
    lista = []
    stemmer = snowballstemmer.stemmer('spanish')
    for i in pLista:
        #print(i[0])
        raiz = stemmer.stemWords([i[0]])[0]
        lista.append([raiz, i[1]])
    #print(lista)
    return lista
Author: 201265615, Project: TP2_RIT_II15_PY, Lines of code: 12, Source file: consultas.py
Example 10: __init__
def __init__(self, samples=None, stopwords="english", limit=20, logging=False):
    """
    Create a vocabulary which is a mapping from bucket names to lists of
    synonyms that fall into their bucket. Stopwords is a list of words that
    are ignored for the vocabulary and defaults to a built-in english
    stopword list.
    """
    self.stopwords = stopwords
    self.stemmer = snowballstemmer.stemmer("english")
    self.tokens = re.compile(r"[A-Z]?[a-z]{2,}")
    self.logging = logging
    if samples:
        self._generate_vocabulary(samples, limit)
Author: janukobytsch, Project: wikimedia-image-classification, Lines of code: 13, Source file: words.py
Example 11: create_search_terms
import snowballstemmer

# _STOP_WORDS is a module-level stopword set defined elsewhere in search_terms.py

def create_search_terms(string_terms):
    '''Creates search terms by stemming every word within the parameter passed.
    Returns all search terms in one string separated by space'''
    stemmer = snowballstemmer.stemmer('english')
    terms = stemmer.stemWords(string_terms.split())
    search_term = list()
    for term in terms:
        lower_term = term.lower()
        if lower_term not in _STOP_WORDS:
            search_term.append(lower_term)
    return " ".join(search_term)
Author: Trekafe, Project: trekafe_web, Lines of code: 13, Source file: search_terms.py
Example 12: search_result
import snowballstemmer

# render, detect (language detection) and the LANGUAGES mapping from detected
# language codes to snowball algorithm names are imported/defined elsewhere in views.py

def search_result(request):
    query = request.POST.get('query')
    q_words = query.split()
    stemmed_words = []
    for word in q_words:
        lng = detect(word)
        if lng in LANGUAGES:
            lng = LANGUAGES[lng]
            stemmed_words.append(snowballstemmer.stemmer(lng).stemWord(word))
        else:
            stemmed_words.append(word)
    return render(request, 'searchres/search_result.html', {})
Author: alehat, Project: searchengine, Lines of code: 13, Source file: views.py
Example 13: getPalabras
import snowballstemmer

def getPalabras():
    file = "dicc.txt"
    arc = open(file, 'r')
    stemmer = snowballstemmer.stemmer('spanish')
    words = {}
    for i in arc:
        i = i.rstrip()
        i = stemmer.stemWord(i)
        words[i] = "word"
    for i in words.items():
        print(i)
    print(len(words))
Author: andoniVT, Project: OpinionMiningProject, Lines of code: 17, Source file: Utils.py
Example 14: get_coursed_and_create_matrix
import re

import numpy as np
import snowballstemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# modulestore, CourseDetails, TfidMatrixAllCourses, MatrixEdxCoursesId and
# get_stop_words come from the surrounding edX project

def get_coursed_and_create_matrix():
    results = [course for course in modulestore().get_courses() if course.scope_ids.block_type == "course"]
    new_matrix = TfidMatrixAllCourses.objects.all().first() or TfidMatrixAllCourses()
    print(new_matrix.matrix.shape[0] != len(results))
    if new_matrix.matrix.shape[0] != len(results):
        all_courses = [re.sub("<[^>]*>", "", CourseDetails.fetch_about_attribute(x.id, "overview")) for x in results]
        MatrixEdxCoursesId.objects.all().delete()
        # a plain loop replaces the original side-effecting map(), which would
        # not execute under Python 3
        for x in results:
            MatrixEdxCoursesId.objects.create(course_key=x.id, course_index=results.index(x))
        stemmer = snowballstemmer.stemmer("english")
        courses_stem = [" ".join(stemmer.stemWords(x.split())) for x in all_courses]
        vect = TfidfVectorizer(stop_words=get_stop_words(), lowercase=True, dtype=np.float32)
        matrix = vect.fit_transform(courses_stem)
        new_matrix.matrix = matrix
        new_matrix.save()
Author: vz10, Project: edx_telegram_bot, Lines of code: 17, Source file: prediction.py
Example 15: identify_language
# langid, lang_mapping and functools.partial are imported at module level in index_base.py

def identify_language(self, text):
    self.lang = lang_mapping[langid.classify(text)[0]]
    if self.debug:
        print("LANG", self.lang)  #, "stemmer", self.stem
    if self.lang == "greek":
        from stemmers.greek import stem, stopwords
        self.stem = stem
        self.legal_token = partial(self.legal_token, exclude_list=stopwords)
    elif self.lang == "turkish":  # unfortunately, the turkish stemmer isn't included in nltk
        import snowballstemmer
        from stemmers.turkish import stopwords
        self.stem = snowballstemmer.stemmer("turkish").stemWord
        self.legal_token = partial(self.legal_token, exclude_list=stopwords)
    else:
        from nltk.stem import SnowballStemmer
        from nltk.corpus import stopwords
        self.stem = SnowballStemmer(self.lang).stem
        self.legal_token = partial(self.legal_token, exclude_list=stopwords.words(self.lang))
Author: hymloth, Project: pyredise, Lines of code: 18, Source file: index_base.py
Example 16: checkon
import os

import snowballstemmer

# verbose, string2words, ALLSTEMS and the colour helper C are
# module-level names defined elsewhere in refine-stem.py

def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if 'title' not in o.json.keys():
        if verbose:
            print('No title in', o.getKey())
        return 1  # no title
    # check for a different language - to avoid stemming altogether
    if o.tags and ('german' in o.tags or 'french' in o.tags or 'portuguese' in o.tags):
        if 'stemmed' in o.json.keys():
            # if stemmed before marked foreign, remove this info
            del o.json['stemmed']
            F = open(fn, 'w')
            F.write(o.getJSON())
            F.close()
            return 2
        else:
            return 0
    changed = False
    ### champion variant: snowballstemmer - runs in ~13.5s for 96027 titles
    stemmer = snowballstemmer.stemmer('english').stemWords
    ### disregarded variant: snowballstemmer porter - considered outdated
    # stemmer = snowballstemmer.stemmer('porter').stemWords
    ### disregarded variant: stemming - too slow, runs in ~33s for 96027 titles
    # stemmer = lambda xs: [stemming.porter2.stem(x) for x in xs]
    ### disregarded variant: nltk - worse on verbs ending with -ze
    # stemmer3 = lambda xs: [SnowballStemmer("english").stem(x) for x in xs]
    ### end variants
    stemmed = stemmer(string2words(o.get('title')))
    if '' in stemmed:
        print('“{}” is a title of {} and it has an empty word'.format(o.get('title'), C.red(o.getKey())))
        print(string2words(o.get('title')))
        print(stemmer(string2words(o.get('title'))))
    ALLSTEMS.update(stemmed)
    if o.get('stemmed') != stemmed:
        o.json['stemmed'] = stemmed
        changed = True
    if changed:
        F = open(fn, 'w')
        F.write(o.getJSON())
        F.close()
        return 2
    else:
        return 0
Author: bibtex, Project: bibsleigh, Lines of code: 44, Source file: refine-stem.py
Example 17: main
import re
import sys

import snowballstemmer

# usage() prints the command-line help and is defined elsewhere in testapp.py

def main():
    argv = sys.argv
    if len(argv) < 2:
        usage()
        return
    algorithm = 'english'
    if len(argv) > 2:
        algorithm = argv[1]
        argv = argv[2:]
    else:
        argv = argv[1:]
    stemmer = snowballstemmer.stemmer(algorithm)
    splitter = re.compile(r"[\s\.-]")
    for arg in argv:
        for word in splitter.split(arg):
            if word == '':
                continue
            original = word.lower()
            print(original + " -> " + stemmer.stemWord(original))
Author: Marslo, Project: VimConfig, Lines of code: 19, Source file: testapp.py
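A side note on Example 17: the algorithm name passed on the command line must be one of the algorithms bundled with the package. A small sketch, assuming the snowballstemmer.algorithms() helper that the package exposes for listing them:

    import snowballstemmer

    # List the algorithm names accepted by snowballstemmer.stemmer().
    print(snowballstemmer.algorithms())  # e.g. ['arabic', 'danish', ..., 'english', ..., 'turkish']

    # Validate a user-supplied name before constructing a stemmer.
    algorithm = 'english'
    if algorithm in snowballstemmer.algorithms():
        stemmer = snowballstemmer.stemmer(algorithm)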
Example 18: preprocess_features
import pandas

import snowballstemmer

# experiment_gensim, make_ngram_match, word_edit_distance and
# char_edit_distance are helper functions defined elsewhere in experiment.py

def preprocess_features(dataframe):
    # get the count of how many times each product appears, may correlate
    product_counts = pandas.DataFrame(pandas.Series(dataframe.groupby(["product_uid"]).size(), name="product_count"))
    dataframe = pandas.merge(dataframe, product_counts, left_on="product_uid", right_index=True, how="left")
    dataframe = experiment_gensim(dataframe)
    dataframe["search_length"] = dataframe.search_term.str.len()
    dataframe["id_bins"] = pandas.cut(dataframe.id, 20, labels=False)
    # word distribution metrics
    dataframe["title_unigram_overlap"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(1), axis=1)
    dataframe["title_bigram_overlap"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(2), axis=1)
    dataframe["desc_unigram_overlap"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(1), axis=1)
    dataframe["desc_bigram_overlap"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(2), axis=1)
    dataframe["brand_unigram_overlap"] = dataframe[["search_term", "brand_name"]].apply(make_ngram_match(1), axis=1)
    dataframe["brand_bigram_overlap"] = dataframe[["search_term", "brand_name"]].apply(make_ngram_match(2), axis=1)
    # stemmed unigrams
    stemmer = snowballstemmer.stemmer("english")
    dataframe["title_unigram_overlap_stemmed"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(1, stemmer=stemmer.stemWord), axis=1)
    dataframe["desc_unigram_overlap_stemmed"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(1, stemmer=stemmer.stemWord), axis=1)
    dataframe["title_bigram_overlap_stemmed"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(2, stemmer=stemmer.stemWord), axis=1)
    dataframe["desc_bigram_overlap_stemmed"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(2, stemmer=stemmer.stemWord), axis=1)
    # edit distance metrics (slow)
    dataframe["title_word_edit_distance"] = dataframe[["search_term", "product_title"]].apply(word_edit_distance, axis=1)
    dataframe["title_char_edit_distance"] = dataframe[["search_term", "product_title"]].apply(char_edit_distance, axis=1)
    # dataframe["desc_word_edit_distance"] = dataframe[["search_term", "product_description"]].apply(word_edit_distance, axis=1)
    # dataframe["desc_char_edit_distance"] = dataframe[["search_term", "product_description"]].apply(char_edit_distance, axis=1)
    dataframe = dataframe.drop(["product_title", "search_term", "id", "product_description", "brand_name"], axis=1)
    print(dataframe.describe())
    return dataframe
Author: ktrnka, Project: kaggle-home-depot, Lines of code: 40, Source file: experiment.py
Example 19: textrank
from itertools import combinations

import networkx as nx
import snowballstemmer
from nltk.tokenize import PunktSentenceTokenizer, RegexpTokenizer

# lang_identifier, similarity and LANG_CODES are defined elsewhere in summarizer.py

def textrank(text, hdr):
    sent_tokenizer = PunktSentenceTokenizer()
    sentences = sent_tokenizer.tokenize(text)
    word_tokenizer = RegexpTokenizer(r'\w+')
    # finding out the most probable language of the text
    lang_code = lang_identifier.classify(' '.join([hdr, text]))[0]
    stemmer = snowballstemmer.stemmer(LANG_CODES.get(lang_code, 'english'))
    words = [set(stemmer.stemWord(word) for word in word_tokenizer.tokenize(sentence.lower()))
             for sentence in sentences]
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)
    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)
    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True), lang_code
Author: Why-Not-Sky, Project: wanish, Lines of code: 22, Source file: summarizer.py
Example 20: seeker_highlight
import re

# _phrase_re and seeker_format are defined elsewhere in seeker.py

def seeker_highlight(text, query, algorithm='english'):
    try:
        import snowballstemmer
        stemmer = snowballstemmer.stemmer(algorithm)
        stemWord = stemmer.stemWord
        stemWords = stemmer.stemWords
    except ImportError:
        # fall back to no-op stemming when snowballstemmer isn't installed
        stemWord = lambda word: word
        stemWords = lambda words: words
    phrases = _phrase_re.findall(query)
    keywords = [w.lower() for w in re.split(r'\W+', _phrase_re.sub('', query)) if w]
    highlight = set(stemWords(keywords))
    text = seeker_format(text)
    for phrase in phrases:
        text = re.sub('(' + re.escape(phrase) + ')', r'<em>\1</em>', text, flags=re.I)
    parts = []
    for word in re.split(r'(\W+)', text):
        if stemWord(word.lower()) in highlight:
            parts.append('<em>%s</em>' % word)
        else:
            parts.append(word)
    return ''.join(parts)
Author: lefcourn, Project: django-seeker, Lines of code: 22, Source file: seeker.py
Note: The snowballstemmer.stemmer examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by many developers; copyright in the source code remains with the original authors, and any distribution or use should follow the corresponding project's License. Do not repost without permission.