This article collects typical usage examples of the Python function nltk.corpus.stopwords.fileids. If you have been asking yourself how fileids works, how to call it, or what real code that uses it looks like, the curated examples below should help.
Below are 20 code examples of the fileids function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps our system recommend better Python examples.
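Before the examples, a minimal sketch of the function itself: stopwords.fileids() simply returns the list of languages for which NLTK ships a stopword file. The exact list shown in the comment is illustrative and depends on the NLTK data installed on your machine.

from nltk.corpus import stopwords

print(stopwords.fileids())
# e.g. ['arabic', 'danish', 'dutch', 'english', 'french', 'german', ...]
# -- the exact languages depend on your nltk_data installation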
Example 1: split_count

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def split_count(sentence):
    """Tokenize the sentence and score every language NLTK knows by how
    many of that language's stopwords appear in the input."""
    languages_ratios = {}
    lang_dict = {}
    tokens = wordpunct_tokenize(sentence)      # tokenize the input
    words = [word.lower() for word in tokens]  # lowercase every token
    words_set = set(words)                     # unique tokens only
    vocab_list = words
    for language in stopwords.fileids():       # iterate over NLTK's built-in languages
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # this determines the score
        lang_dict[language] = common_elements
    # Rank languages by score; the two best are the main and secondary guesses.
    ranked = sorted(languages_ratios, key=languages_ratios.get, reverse=True)
    main_language, secondary_lang = ranked[0], ranked[1]
    print "this is the set for main lang:", lang_dict.get(main_language), "\n"
    print "this is the set for second lang:", lang_dict.get(secondary_lang), "\n"
    print "this is vocab_list: ", vocab_list, "\n"
    print "this is DICT: ", lang_dict
    print "ORIGINAL SENTENCE: ", sentence

Author: guti15, Project: School-Jr_second_semester, Lines: 32, Source: THINK_april.py
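A quick usage sketch for the function above (assuming the stopwords corpus has been fetched once with nltk.download('stopwords')):

split_count("hola como estas, I hope you are doing well")
# Prints the stopword overlap for the best and second-best languages
# (here likely Spanish and English), the token list, the per-language
# dict, and the original sentence.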
Example 2: hello_world

import zipfile

import nltk
import requests
from flask import request
from nltk.corpus import stopwords

def hello_world():
    if request.method == 'POST':
        print "Request: ", request
        print "Form: ", request.form
        print "Files: ", request.files
        archive = zipfile.ZipFile(request.files.get("solution"))
        with archive.open("extra.txt") as solution:
            languages_ratios = {}
            tokens = nltk.wordpunct_tokenize(solution.read().decode('utf-8'))
            words_list = [word.lower() for word in tokens]
            words_set = set(words_list)
            print "Words_set: ", words_set
            for language in stopwords.fileids():
                stopwords_set = set(stopwords.words(language))
                common_elements = words_set.intersection(stopwords_set)
                if common_elements:
                    languages_ratios[language] = len(common_elements)
            print "Language ratios: ", languages_ratios
            # 50% of the mark: the dominant language must be English
            mark = 50 if max(languages_ratios, key=languages_ratios.get) == 'english' else 0
            print "Mark for lang: ", mark
            # The other 50% scales with word count, capped at 200 words
            words_count = len(words_list)
            print "Words count: ", words_count
            mark += (float(words_count) / 200) * 50 if words_count < 200 else 50
            print "Total Mark: ", mark
            req = requests.post(request.form["url"], data={"mark": int(mark)})
    return ''

Author: Cybran111, Project: learningsystem-extra-aux, Lines: 31, Source: learningsystem-auxserver.py
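To make the marking arithmetic above concrete, here is the same computation on a hypothetical 150-word English submission (plain arithmetic, no Flask involved):

words_count = 150                     # a 150-word English submission
mark = 50                             # language component: English detected
mark += (float(words_count) / 200) * 50 if words_count < 200 else 50
print(int(mark))                      # -> 87, i.e. 50 + 37.5 truncated to int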
Example 3: calculate_language_scores

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def calculate_language_scores(text):
    """
    Estimate how likely the given text is to be written in each of several
    languages and return a dictionary that looks like
    {'french': 2, 'spanish': 4, 'english': 0}.

    :param text: Text to analyze.
    :type text: str

    :return: Dictionary with languages and unique stopwords seen in analyzed text.
    :rtype: dict(str -> int)

    :raises: TypeError
    """
    if not isinstance(text, basestring):  # Python 2: accept str or unicode
        raise TypeError("Expected basestring, got '%s' instead" % type(text))
    if not text:
        return {}

    languages_ratios = {}

    # Split the text into separate tokens, using natural language punctuation signs.
    tokens = wordpunct_tokenize(text)
    tokenized_words = [word.lower() for word in tokens]
    words_set = set(tokenized_words)  # build the set once, outside the loop

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios

Author: Autoscan, Project: golismero, Lines: 31, Source: natural_language.py
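A usage sketch (Python 2, matching the example's basestring check; assumes the stopwords corpus is installed):

scores = calculate_language_scores("Hola, como estas? I'll be there in ten.")
print scores['spanish'], scores['english']
# Each value is the count of that language's unique stopwords found in the text.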
Example 4: _calculate_languages_ratios

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def _calculate_languages_ratios(text):
    """
    Estimate how likely the given text is to be written in each of several
    languages and return a dictionary that looks like
    {'french': 2, 'spanish': 4, 'english': 0}

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary with languages and unique stopwords seen in analyzed text
    @rtype: dict
    """
    languages_ratios = {}

    # nltk.wordpunct_tokenize() splits all punctuation into separate tokens
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
    words_set = set(words)

    # For each language included in NLTK, count the unique stopwords
    # appearing in the analyzed text.
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios

Author: annamarie-g, Project: capstone_project, Lines: 30, Source: clean_dataframe.py
Example 5: detect_language

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def detect_language(comment):
    """
    To detect the language, we compare a comment to the stopword lists of each
    language. The language that shares the most stopwords with the comment is
    most likely the language the comment is written in. This is obviously not
    foolproof; a well-written comment will fare far better than one written in
    slang or with poor grammar, so it tends to favor comments that are more
    valuable because of their structure. In addition, languages that are easily
    distinguished from English can be detected, which makes it possible to
    compare the language of a comment to the actual content annotated in
    Hypothes.is, since most users won't understand comments in a different
    language anyway.
    """
    # First we tokenize the comment
    tokens = wordpunct_tokenize(comment)
    words = [word.lower() for word in tokens]
    words_set = set(words)

    languages_ratios = {}
    # Then we compare the words to the most frequent stopwords per language
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        # Calculate the language score
        languages_ratios[language] = len(common_elements)

    # Return the key with the highest value
    most_rated_language = max(languages_ratios, key=languages_ratios.get)
    return most_rated_language

Author: JakeHartnell, Project: NLPFinalProject, Lines: 30, Source: test_features.py
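Usage sketch for the detector above (assumes the stopwords corpus is installed):

print detect_language("This annotation explains the central claim of the article.")
# -> 'english'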
Example 6: calcularValoresDeIdioma

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def calcularValoresDeIdioma(contenido):
    languages_ratios = {}
    tokens = wordpunct_tokenize(contenido)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)
    return languages_ratios

Author: andreslechuga, Project: arte_mexicano_antiguo, Lines: 10, Source: textract.py
Example 7: calculate_languages_ratios

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def calculate_languages_ratios(text):
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"
    return languages_ratios

Author: rashmi1403, Project: Language_Detection, Lines: 10, Source: multiLanguage_stopwords.py
Example 8: detectLanguage

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def detectLanguage(self, text):  # method of the summarizer class in the source file
    languages_scores = {}
    tokens = word_tokenize(text)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    # Compute, per language included in NLTK, the number of unique
    # stopwords appearing in the analyzed text.
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_scores[language] = len(common_elements)  # language "score"
    return max(languages_scores, key=languages_scores.get)

Author: rusad, Project: summarizer, Lines: 13, Source: summarizer.py
Example 9: check_language

from nltk.corpus import stopwords

def check_language(self, word_list):  # method of the preprocessing class in the source file
    """Source: http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/"""
    languages_ratios = {}
    words_set = set(word_list)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        # Check similarity
        common_elements = words_set.intersection(stopwords_set)
        # Save as ratio
        languages_ratios[language] = len(common_elements)
    # Get the language with the most similarities
    most_rated_language = max(languages_ratios, key=languages_ratios.get)
    return most_rated_language

Author: sagieske, Project: NLP1, Lines: 14, Source: preprocessing_sharonstest.py
Example 10: cal

import sys

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def cal():
    text = sys.stdin.read()
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)
    most = max(languages_ratios, key=languages_ratios.get)
    print(most)
    # if most == "english": ...

Author: shark-S, Project: Text-to-language-detect, Lines: 15, Source: language-detect-nltk.py
Example 11: _calculate_languages_ratios

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def _calculate_languages_ratios(text):
    text = str(text)  # ensure we are working with a string
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    # Compute, per language included in NLTK, the number of unique
    # stopwords appearing in the analyzed text.
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"
    return languages_ratios

Author: edlectrico, Project: nltk_sentiment_analysis, Lines: 15, Source: language_detector.py
Example 12: _calculate_languages_ratios

from nltk.corpus import stopwords

def _calculate_languages_ratios(self, text):  # method of the detector class in the source file
    # Estimate how likely the text is to be written in each language and
    # return a dictionary like {'french': 2, 'english': 4, 'dutch': 0}.
    languages_ratios = {}
    tokens = self.getWords(text)
    words_set = set(tokens)
    # Count, per language, how many stopwords appear in the text.
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # stopwords seen for this language
    return languages_ratios

Author: VictorYt, Project: P_ADT_r_BIG_2015, Lines: 16, Source: detect.py
Example 13: language_detector

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def language_detector(string):
    tokens = wordpunct_tokenize(string)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    # Compute language scores
    languages_ratios = {}
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"
    most_rated_language = max(languages_ratios, key=languages_ratios.get)
    return most_rated_language

Author: nsaleille, Project: nlp_mylanguageexchange, Lines: 16, Source: ntlk_language_detector.py
Example 14: lang_likelihood

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def lang_likelihood(self, document):  # method of the detector class in the source file
    '''This method computes the language likelihood using the algorithm
    and tokenizer from NLTK.
    '''
    languages_likelihood = {}
    tokens = wordpunct_tokenize(document)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_likelihood[language] = len(common_elements)  # language "score"
    return languages_likelihood

Author: onebit1984, Project: epidetect, Lines: 17, Source: epidetect.py
Example 15: _calculate_languages_ratios

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def _calculate_languages_ratios(text):
    """
    Estimate how likely the given text is to be written in each of several
    languages and return a dictionary that looks like
    {'french': 2, 'spanish': 4, 'english': 0}

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary with languages and unique stopwords seen in analyzed text
    @rtype: dict
    """
    languages_ratios = {}

    # nltk.wordpunct_tokenize() splits all punctuation into separate tokens:
    # >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
    # ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll', 'be', 'there', 'in', 'ten', '.']
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
    words_set = set(words)

    # For each language included in NLTK, count the unique stopwords appearing
    # in the analyzed text. Portuguese gets a few extra domain-specific terms.
    for language in stopwords.fileids():
        if language == "portuguese":
            lista = stopwords.words(language)
            lista.append('Fatec')
            lista.append('fatec')
            lista.append('Palmeiras')
            lista.append('palmeiras')
            lista.append('Dilma')
            lista.append('dilma')
            lista.append('Copa')
            lista.append('copa')
            stopwords_set = set(lista)
        else:
            stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios

Author: silviolima, Project: analise-Sentimento-nltk, Lines: 44, Source: testarMsg.py
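A note on the Portuguese branch above: the repeated append calls work, but the same augmentation reads more clearly as a set union. A hedged alternative sketch (EXTRA_PT_STOPWORDS is a name introduced here, not part of the original project):

# EXTRA_PT_STOPWORDS is a hypothetical name for the extra domain terms.
EXTRA_PT_STOPWORDS = {'Fatec', 'fatec', 'Palmeiras', 'palmeiras',
                      'Dilma', 'dilma', 'Copa', 'copa'}
stopwords_set = set(stopwords.words('portuguese')) | EXTRA_PT_STOPWORDS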
Example 16: calculate_language_scores

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def calculate_language_scores(text):
    """
    Estimate how likely the given text is to be written in each of several
    languages and return a dictionary that looks like
    {'french': 2, 'spanish': 4, 'english': 0}.

    :param text: Text to analyze.
    :type text: str

    :return: Dictionary with languages and unique stopwords seen in analyzed text.
    :rtype: dict(str -> int)
    """
    # Split the text into separate tokens, using natural language punctuation signs.
    words = {word.lower() for word in wordpunct_tokenize(text)}

    # Return the number of stopwords found per language. This must be a dict
    # comprehension keyed by language to match the documented return type.
    return {
        language: len(words.intersection(stopwords.words(language)))
        for language in stopwords.fileids()
    }

Author: IFGHou, Project: golismero, Lines: 20, Source: natural_language.py
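Usage sketch confirming the documented dict shape (assumes the stopwords corpus is installed):

scores = calculate_language_scores("Je suis ici. Where are you?")
print scores['french'], scores['english']  # per-language stopword counts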
Example 17: main

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def main():
    # Step 1: tokenize the input so we have clean words to match against the stopwords
    print "\n -----------------------------\n"
    split = wordpunct_tokenize("hola como estas, espero que estes bien")
    print split
    print "\n -----------------------------\n"

    # Now let's get serious
    languages_ratios = {}
    tokens = wordpunct_tokenize("hola como estas?")
    words = [word.lower() for word in tokens]
    words_set = set(words)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_element = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_element)  # this determines the score
    print languages_ratios

Author: guti15, Project: School-Jr_second_semester, Lines: 20, Source: startingNLTK.py
Example 18: identify_language

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def identify_language(text):
    """
    Identify a language, given a text of that language.

    Parameters
    ----------
    text : str

    Returns
    -------
    list of tuples (ISO 639-3 code, score)

    Examples
    --------
    >>> identify_language('Ich gehe zur Schule.')
    [('deu', 0.8)]
    """
    languages_ratios = []
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
    words_set = set(words)

    # Check how many stopwords of the languages NLTK knows appear in the
    # provided text.
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        score = len(common_elements)
        languages_ratios.append((language, score))

    # Normalize the scores so they sum to 1. (_nltk_to_iso369_3 maps NLTK
    # language names to ISO 639-3 codes; it is defined elsewhere in the
    # source file.)
    sum_scores = float(sum(el[1] for el in languages_ratios))
    languages_ratios = [(_nltk_to_iso369_3(el[0]), el[1])
                        for el in languages_ratios]
    if sum_scores > 0:
        languages_ratios = [(el[0], el[1] / sum_scores)
                            for el in languages_ratios]

    return sorted(languages_ratios, key=lambda n: n[1], reverse=True)

Author: MartinThoma, Project: algorithms, Lines: 41, Source: language_identification.py
Example 19: detect_lang

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

def detect_lang(self, text):  # method of the analyzer class in the source file
    """Return the detected language.

    Args:
        text: input text

    Returns:
        the detected language string
    """
    language_ratio = {}
    words = wordpunct_tokenize(text)
    words_set = set(word.lower() for word in words)  # lowercase to match NLTK's stopword lists
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_words = words_set.intersection(stopwords_set)
        language_ratio[language] = len(common_words)
    detected_lang = max(language_ratio, key=language_ratio.get)
    return detected_lang

Author: gdamdam, Project: sumo, Lines: 21, Source: analyzer.py
Example 20: capitalize

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

# CYRILLIC_ALPHABET, PRECALCULATED_LANGSETS and the *Capitalization
# classes are defined elsewhere in the source file.

def capitalize(text):
    """
    Text capitalizator for Python 2.
    """
    if isinstance(text, str):
        text = text.decode("utf-8")

    if set(text) & CYRILLIC_ALPHABET:
        language = "russian"
    else:
        words = set(wordpunct_tokenize(text.lower()))
        # Pick the language whose precomputed stopword set overlaps most.
        language = max(
            stopwords.fileids(),
            key=lambda lang: len(words & PRECALCULATED_LANGSETS[lang])
        )

    class_ = EnglishCapitalization
    if language == "russian":
        class_ = RussianCapitalization
    elif language == "spanish":
        class_ = SpanishCapitalization
    elif language == "dutch":
        class_ = DutchCapitalization
    return class_().capitalize(text)

Author: 9seconds, Project: rymtracks, Lines: 23, Source: py2.py
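A hedged sketch of how a constant like PRECALCULATED_LANGSETS in the example above might be built; the construction here is an assumption for illustration, not the rymtracks project's actual code:

from nltk.corpus import stopwords

# Precompute one stopword set per language so the max() scan above does
# not rebuild the sets on every call. (Assumed construction.)
PRECALCULATED_LANGSETS = {
    lang: set(stopwords.words(lang)) for lang in stopwords.fileids()
}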
Note: the nltk.corpus.stopwords.fileids examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by many programmers; copyright remains with the original authors. For distribution and use, refer to the License of the corresponding project. Do not reproduce without permission.