Python snowballstemmer.stemmer Function Code Examples


This article collects typical usage examples of the snowballstemmer.stemmer function in Python. If you are wondering what the stemmer function does, how to call it, or what real-world usage looks like, the curated examples below should help.



Below are 20 code examples of the stemmer function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
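
Before the examples, here is a minimal sketch of the API that all of them share: snowballstemmer.stemmer(language) returns a stemmer object, and its stemWord/stemWords methods do the actual stemming. The outputs in the comments are what the English Snowball algorithm is expected to produce.

import snowballstemmer

stemmer = snowballstemmer.stemmer('english')
print(stemmer.stemWord('running'))             # -> 'run'
print(stemmer.stemWords(['cats', 'berries']))  # -> ['cat', 'berri']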

Example 1: clean

import re
import string

import snowballstemmer

# five_stemmer, simple_stem, and STOP_WORDS are defined elsewhere in corpus_stats.py


def clean(text, stemmer='snowball'):
    """Normalize, split, and clean text.

    Parameters
    ----------
    text : str
        Block of text to clean and prepare.
    stemmer : str, optional
        Stemmer to use: one of 'snowball', 'five', 'simple', or 'none'.

    Returns
    -------
    list of str
        Cleaned and prepared tokens.
    """
    if stemmer not in ['snowball', 'five', 'simple', 'none']:
        raise ValueError("Stemmer choice not available.")

    # Lowercase, then replace punctuation with spaces before splitting
    text = re.sub("[{}]".format(string.punctuation), " ", text.lower())
    text = text.split()

    if stemmer == 'five':
        text = [five_stemmer(item) for item in text]
    elif stemmer == 'snowball':
        stemmer = snowballstemmer.stemmer('english')
        text = stemmer.stemWords(text)
    elif stemmer == 'simple':
        text = [simple_stem(item) for item in text]

    text = [item for item in text if item not in STOP_WORDS]

    return text
Author: justincely, Project: classwork, Lines: 35, Source: corpus_stats.py


Example 2: preprocess_document

import string

import nltk
import snowballstemmer
from nltk.corpus import stopwords


def preprocess_document(data):
    # Step 1: strip punctuation
    data = data.lower()
    punctuation = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']',
                   '{', '}', '#', '\\', '/', '@', '\xa0', '\n', '&', '$', '‘', '…', '•', '-']
    for punc in punctuation:
        data = data.replace(punc, '')

    # Step 2: tokenize
    data = list(nltk.word_tokenize(data))

    # Step 3: strip stopwords
    stop = set(stopwords.words('english'))
    extra_stopwords = ['ok', 'oh', 'via', 'bc', 'gon', 'na']  # add any additional stopwords we want to use here
    stop.update(extra_stopwords)
    stop.update(list(string.ascii_lowercase))  # remove all single letters
    data = [i for i in data if i not in stop]  # remove stopwords

    # Step 4: stemming
    stemmer = snowballstemmer.stemmer('english')
    data = stemmer.stemWords(data)

    # Step 5: keep only words found in the NLTK English corpus
    # (the original removed items while iterating over the same list,
    # which silently skips elements; filtering into a new list avoids that)
    words = set(nltk.corpus.words.words())
    data = [w for w in data if w in words]
    return data
Author: ZanW, Project: Python, Lines: 27, Source: News_pre_l.py


Example 3: stemming

import codecs

import snowballstemmer


def stemming(lang, input, output, encoding, pretty):
    result = []
    stemmer = snowballstemmer.stemmer(lang)
    for original in codecs.open(input, "r", encoding).readlines():
        original = original.strip()
        # Convert only ASCII letters to lowercase, to match C behavior
        # (lower_ is a helper defined elsewhere in stemwords.py)
        original = ''.join((lower_(c) if 'A' <= c <= 'Z' else c for c in original))
        stemmed = stemmer.stemWord(original)
        if result:
            result.append('\n')
        if pretty == 0:
            if stemmed != "":
                result.append(stemmed)
        elif pretty == 1:
            # append() takes a single argument, so concatenate first
            result.append(original + " -> " + stemmed)
        elif pretty == 2:
            result.append(original)
            if len(original) < 30:
                result.append(" " * (30 - len(original)))
            else:
                result.append("\n")
                result.append(" " * 30)
            result.append(stemmed)
    outfile = codecs.open(output, "w", encoding)
    outfile.write(''.join(result) + '\n')
    outfile.close()
Author: xjzhou, Project: snowball, Lines: 26, Source: stemwords.py


Example 4: getHighlightingsVariables

def getHighlightingsVariables(self, article, variable_keywords, variable_pages):
    stemmer = snowballstemmer.stemmer("german")
    # goodchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÖÜäöüß'"
    for i in range(0, len(article)):
        for j in range(0, len(article[i])):
            article[i][j] = article[i][j].split(" ")
            for k in range(0, len(article[i][j])):
                # article[i][j][k] = chrtran(article[i][j][k], goodchars, "")
                article[i][j][k] = stemmer.stemWord(article[i][j][k])

    for i in range(0, len(variable_keywords)):
        # variable_keywords[i] = chrtran(variable_keywords[i], goodchars, "")
        variable_keywords[i] = stemmer.stemWord(variable_keywords[i])

    highlight = []

    for i in range(0, len(article)):
        highlight_article = []

        for j in range(0, len(article[i])):
            highlight_variables = []
            for k in range(0, len(variable_keywords)):
                highlight_variables.append(random.random())
            highlight_article.append(highlight_variables)

        highlight.append(highlight_article)

    return highlight
Author: Institute-Web-Science-and-Technologies, Project: westcat, Lines: 31, Source: Highlighter_Articles.py


Example 5: turkish

import snowballstemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def turkish(sent):
    # NLTK has no Turkish stemmer, so fall back to snowballstemmer
    stem = snowballstemmer.stemmer('turkish')
    stop = stopwords.words('turkish')
    tx = word_tokenize(sent)
    mx = stem.stemWords(tx)
    px = [x for x in mx if x not in stop]
    return px
Author: Jiannan28, Project: stemtokstop, Lines: 8, Source: stemtokstop.py


Example 6: __init__

def __init__(self, language=None):
    """Create a new highlighter for the specified language."""
    if language:
        self.stem = snowballstemmer.stemmer(language)
    else:
        self.stem = NoStem()
Author: flaxsearch, Project: highlighter, Lines: 8, Source: highlight.py


Example 7: aplicarStemmer

def aplicarStemmer(pDictPalabrasArchivos):
    print("aplicando stemming...")
    dictRaices = {}
    stemmer = snowballstemmer.stemmer("spanish")
    for docId, palabras in pDictPalabrasArchivos.items():
        raices = stemmer.stemWords(palabras)
        dictRaices[docId] = raices
    ##    archivo.archivo.crearCSVDict(".\stemming.csv",dictRaices)
    return dictRaices
Author: 201265615, Project: TP2_RIT_II15_PY, Lines: 9, Source: archivo_invertido.py


Example 8: __init__

def __init__(self, xml):
    self.dest = xml.get("dest")
    if self.dest is None:
        raise ValueError()
    # treat "verbose" as a flag: present means True, absent means False
    self.verbose = xml.get("verbose") is not None
    self.stemmer = snowballstemmer.stemmer('english')
Author: Sentimentron, Project: Nebraska-public, Lines: 10, Source: stemmer.py


Example 9: aplicarStemmerConsulta

def aplicarStemmerConsulta(pLista):
    #print(pLista)
    print("aplicando stemming...")
    lista = []
    stemmer = snowballstemmer.stemmer('spanish')
    for i in pLista:
        #print(i[0])
        raiz = stemmer.stemWords([i[0]])[0]
        lista.append([raiz,i[1]])
        #print(i[0])
    #print(lista)
    return lista
Author: 201265615, Project: TP2_RIT_II15_PY, Lines: 12, Source: consultas.py


Example 10: __init__

def __init__(self, samples=None, stopwords="english", limit=20, logging=False):
    """
    Create a vocabulary, which is a mapping from bucket names to lists of
    synonyms that fall into that bucket. Stopwords is a list of words that
    are ignored for the vocabulary and defaults to a built-in English
    stopword list.
    """
    self.stopwords = stopwords
    self.stemmer = snowballstemmer.stemmer("english")
    self.tokens = re.compile(r"[A-Z]?[a-z]{2,}")
    self.logging = logging
    if samples:
        self._generate_vocabulary(samples, limit)
Author: janukobytsch, Project: wikimedia-image-classification, Lines: 13, Source: words.py


Example 11: create_search_terms

def create_search_terms(string_terms):
    '''Creates search terms by stemming every word within the parameter passed.
    Returns all search terms in one string, separated by spaces.'''
    stemmer = snowballstemmer.stemmer('english')
    terms = stemmer.stemWords(string_terms.split())

    search_term = list()
    for term in terms:
        lower_term = term.lower()
        if lower_term not in _STOP_WORDS:
            search_term.append(lower_term)

    return " ".join(search_term)
Author: Trekafe, Project: trekafe_web, Lines: 13, Source: search_terms.py


Example 12: search_result

def search_result(request):
    query = request.POST.get('query')
    q_words = query.split()
    stemmed_words = []
    for word in q_words:
        lng = detect(word)
        if lng in LANGUAGES:
            lng = LANGUAGES[lng]
            stemmed_words.append(snowballstemmer.stemmer(lng).stemWord(word))
        else:
            stemmed_words.append(word)

    # note: stemmed_words is built but not yet passed into the template context
    return render(request, 'searchres/search_result.html', {})
Author: alehat, Project: searchengine, Lines: 13, Source: views.py


Example 13: getPalabras

def getPalabras():
    filename = "dicc.txt"  # renamed from `file` to avoid shadowing the builtin

    arc = open(filename, 'r')
    stemmer = snowballstemmer.stemmer('spanish')

    words = {}
    for i in arc:
        i = i.rstrip()
        i = stemmer.stemWord(i)
        words[i] = "word"

    for i in words.items():
        print(i)

    print(len(words))
Author: andoniVT, Project: OpinionMiningProject, Lines: 17, Source: Utils.py


Example 14: get_coursed_and_create_matrix

def get_coursed_and_create_matrix():
    results = [course for course in modulestore().get_courses() if course.scope_ids.block_type == "course"]
    new_matrix = TfidMatrixAllCourses.objects.all().first() or TfidMatrixAllCourses()
    print(new_matrix.matrix.shape[0] != len(results))
    if new_matrix.matrix.shape[0] != len(results):
        all_courses = [re.sub("<[^>]*>", "", CourseDetails.fetch_about_attribute(x.id, "overview")) for x in results]

        MatrixEdxCoursesId.objects.all().delete()
        # a plain loop instead of map(): under Python 3 map() is lazy,
        # so the original version would never create the objects
        for x in results:
            MatrixEdxCoursesId.objects.create(course_key=x.id, course_index=results.index(x))

        stemmer = snowballstemmer.stemmer("english")
        courses_stem = [" ".join(stemmer.stemWords(x.split())) for x in all_courses]

        vect = TfidfVectorizer(stop_words=get_stop_words(), lowercase=True, dtype=np.float32)
        matrix = vect.fit_transform(courses_stem)
        new_matrix.matrix = matrix
        new_matrix.save()
Author: vz10, Project: edx_telegram_bot, Lines: 17, Source: prediction.py


Example 15: identify_language

def identify_language(self, text):
    self.lang = lang_mapping[langid.classify(text)[0]]
    if self.debug:
        print("LANG", self.lang)  # , "stemmer", self.stem

    if self.lang == "greek":
        from stemmers.greek import stem, stopwords
        self.stem = stem
        self.legal_token = partial(self.legal_token, exclude_list=stopwords)
    elif self.lang == "turkish":  # unfortunately, the Turkish stemmer isn't included in NLTK
        import snowballstemmer
        from stemmers.turkish import stopwords
        self.stem = snowballstemmer.stemmer("turkish").stemWord
        self.legal_token = partial(self.legal_token, exclude_list=stopwords)
    else:
        from nltk.stem import SnowballStemmer
        from nltk.corpus import stopwords
        self.stem = SnowballStemmer(self.lang).stem
        self.legal_token = partial(self.legal_token, exclude_list=stopwords.words(self.lang))
Author: hymloth, Project: pyredise, Lines: 18, Source: index_base.py


Example 16: checkon

def checkon(fn, o):
    if not os.path.exists(fn) or os.path.isdir(fn):
        fn = fn + '.json'
    if 'title' not in o.json:
        if verbose:
            print('No title in', o.getKey())
        return 1  # no title
    # check for a different language - to avoid stemming altogether
    if o.tags and ('german' in o.tags or 'french' in o.tags or 'portuguese' in o.tags):
        if 'stemmed' in o.json:
            # if stemmed before being marked foreign, remove this info
            del o.json['stemmed']
            F = open(fn, 'w')
            F.write(o.getJSON())
            F.close()
            return 2
        else:
            return 0
    changed = False
    ### champion variant: snowballstemmer - runs in ~13.5s for 96027 titles
    stemmer = snowballstemmer.stemmer('english').stemWords
    ### disregarded variant: snowballstemmer porter - considered outdated
    # stemmer = snowballstemmer.stemmer('porter').stemWords
    ### disregarded variant: stemming - too slow, runs in ~33s for 96027 titles
    # stemmer = lambda xs: [stemming.porter2.stem(x) for x in xs]
    ### disregarded variant: nltk - worse on verbs ending with -ze
    # stemmer3 = lambda xs: [SnowballStemmer("english").stem(x) for x in xs]
    ### end variants
    stemmed = stemmer(string2words(o.get('title')))
    if '' in stemmed:
        print('“{}” is a title of {} and it has an empty word'.format(o.get('title'), C.red(o.getKey())))
        print(string2words(o.get('title')))
        print(stemmer(string2words(o.get('title'))))
    ALLSTEMS.update(stemmed)
    if o.get('stemmed') != stemmed:
        o.json['stemmed'] = stemmed
        changed = True
    if changed:
        F = open(fn, 'w')
        F.write(o.getJSON())
        F.close()
        return 2
    else:
        return 0
Author: bibtex, Project: bibsleigh, Lines: 44, Source: refine-stem.py


Example 17: main

def main():
    argv = sys.argv
    if len(argv) < 2:
        usage()
        return
    algorithm = 'english'
    if len(argv) > 2:
        algorithm = argv[1]
        argv = argv[2:]
    else:
        argv = argv[1:]
    stemmer = snowballstemmer.stemmer(algorithm)
    splitter = re.compile(r"[\s\.-]")
    for arg in argv:
        for word in splitter.split(arg):
            if word == '':
                continue
            original = word.lower()
            print(original + " -> " + stemmer.stemWord(original))
Author: Marslo, Project: VimConfig, Lines: 19, Source: testapp.py


Example 18: preprocess_features

def preprocess_features(dataframe):
    # get the count of how many times each product appears, may correlate
    product_counts = pandas.DataFrame(pandas.Series(dataframe.groupby(["product_uid"]).size(), name="product_count"))
    dataframe = pandas.merge(dataframe, product_counts, left_on="product_uid", right_index=True, how="left")

    dataframe = experiment_gensim(dataframe)

    dataframe["search_length"] = dataframe.search_term.str.len()

    dataframe["id_bins"] = pandas.cut(dataframe.id, 20, labels=False)

    # word distribution metrics
    dataframe["title_unigram_overlap"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(1), axis=1)
    dataframe["title_bigram_overlap"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(2), axis=1)

    dataframe["desc_unigram_overlap"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(1), axis=1)
    dataframe["desc_bigram_overlap"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(2), axis=1)

    dataframe["brand_unigram_overlap"] = dataframe[["search_term", "brand_name"]].apply(make_ngram_match(1), axis=1)
    dataframe["brand_bigram_overlap"] = dataframe[["search_term", "brand_name"]].apply(make_ngram_match(2), axis=1)

    # stemmed unigrams
    stemmer = snowballstemmer.stemmer("english")
    dataframe["title_unigram_overlap_stemmed"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(1, stemmer=stemmer.stemWord), axis=1)
    dataframe["desc_unigram_overlap_stemmed"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(1, stemmer=stemmer.stemWord), axis=1)
    dataframe["title_bigram_overlap_stemmed"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(2, stemmer=stemmer.stemWord), axis=1)
    dataframe["desc_bigram_overlap_stemmed"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(2, stemmer=stemmer.stemWord), axis=1)


    # edit distance metrics (slow)
    dataframe["title_word_edit_distance"] = dataframe[["search_term", "product_title"]].apply(word_edit_distance, axis=1)
    dataframe["title_char_edit_distance"] = dataframe[["search_term", "product_title"]].apply(char_edit_distance, axis=1)
    # dataframe["desc_word_edit_distance"] = dataframe[["search_term", "product_description"]].apply(word_edit_distance, axis=1)
    # dataframe["desc_char_edit_distance"] = dataframe[["search_term", "product_description"]].apply(char_edit_distance, axis=1)

    dataframe = dataframe.drop(["product_title", "search_term", "id", "product_description", "brand_name"], axis=1)

    print(dataframe.describe())

    return dataframe
Author: ktrnka, Project: kaggle-home-depot, Lines: 40, Source: experiment.py


Example 19: textrank

def textrank(text, hdr):
    sent_tokenizer = PunktSentenceTokenizer()
    sentences = sent_tokenizer.tokenize(text)
    word_tokenizer = RegexpTokenizer(r'\w+')

    # finding out the most possible language of the text
    lang_code = lang_identifier.classify(' '.join([hdr, text]))[0]

    stemmer = snowballstemmer.stemmer(LANG_CODES.get(lang_code, 'english'))
    words = [set(stemmer.stemWord(word) for word in word_tokenizer.tokenize(sentence.lower()))
             for sentence in sentences]

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)

    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)

    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True), lang_code
Author: Why-Not-Sky, Project: wanish, Lines: 22, Source: summarizer.py


Example 20: seeker_highlight

def seeker_highlight(text, query, algorithm='english'):
    try:
        import snowballstemmer
        stemmer = snowballstemmer.stemmer(algorithm)
        stemWord = stemmer.stemWord
        stemWords = stemmer.stemWords
    except (ImportError, KeyError):
        # snowballstemmer missing or unknown algorithm: fall back to no-op stemming
        stemWord = lambda word: word
        stemWords = lambda words: words
    phrases = _phrase_re.findall(query)
    keywords = [w.lower() for w in re.split(r'\W+', _phrase_re.sub('', query)) if w]
    highlight = set(stemWords(keywords))
    text = seeker_format(text)
    for phrase in phrases:
        text = re.sub('(' + re.escape(phrase) + ')', r'<em>\1</em>', text, flags=re.I)
    parts = []
    for word in re.split(r'(\W+)', text):
        if stemWord(word.lower()) in highlight:
            parts.append('<em>%s</em>' % word)
        else:
            parts.append(word)
    return ''.join(parts)
Author: lefcourn, Project: django-seeker, Lines: 22, Source: seeker.py



Note: the snowballstemmer.stemmer examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors, and distribution and use are governed by each project's license. Do not reproduce without permission.

