This article collects typical usage examples of the Python function nltk.regexp_tokenize. If you have been wondering what exactly regexp_tokenize does, how to call it, or what real-world uses look like, the curated code samples below should help.
The article presents 20 code examples of regexp_tokenize, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
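Before diving into the collected examples, here is a minimal, self-contained sketch of how nltk.regexp_tokenize is typically called. The sample text and pattern below are illustrative assumptions chosen for this note, not taken from any of the projects listed further down:

# Minimal usage sketch (assumed example): regexp_tokenize splits a string into
# the substrings that match the given regular expression, one match per token.
import nltk  # requires the NLTK package (pip install nltk)

text = "That U.S.A. poster-print costs $12.40..."
# Illustrative pattern: words with optional internal hyphens, or currency amounts.
pattern = r"\w+(?:-\w+)*|\$\d+(?:\.\d+)?"
tokens = nltk.regexp_tokenize(text, pattern)
print(tokens)
# ['That', 'U', 'S', 'A', 'poster-print', 'costs', '$12.40']

The examples below all make the same kind of call, only with project-specific patterns: verbose regexes for abbreviations and currency, plain \w+ word patterns, URL patterns, and so on.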
Example 1: get_freqs
def get_freqs(text):
    stop_words = nltk.corpus.stopwords.words('english')
    frequencies = defaultdict(int)
    pattern = r'''(?x)          # set flag to allow verbose regexps
          ([A-Z]\.)+            # abbreviations, e.g. U.S.A.
        | \w+(-\w+)*            # words with optional internal hyphens
        | \$?\d+(\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                # ellipsis
        | [][.,;"'?():-_`]      # these are separate tokens
        '''
    if type(text) == list:
        print 'number of links: ' + str(len(text))
        for t in text:
            content = t['content']
            tokens = nltk.regexp_tokenize(content, pattern)
            for word in tokens:
                if len(word) > 2 and word.lower() not in stop_words:
                    cap = word[0].upper() + word[1:]
                    frequencies[cap] += 1
    else:
        tokens = nltk.regexp_tokenize(text, pattern)
        for word in tokens:
            if len(word) > 2 and word not in stop_words:
                frequencies[word] += 1
    print "frequency size: " + str(len(frequencies))
    return frequencies
Developer: seemless, Project: chainlink, Lines: 29, Source file: chainlink_util.py
Example 2: bag_of_words
def bag_of_words(data, label_codebook, feature_codebook, theta):
    """"""
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            doc_tokens = set(nltk.regexp_tokenize(doc, pattern="\w+"))
            for word in doc_tokens:
                if word not in stopset:
                    word_dict.add(word)
    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist([w for w in all_words])
    word_feature = fdict.keys()[theta:]
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)
    instance_list = {}
    for label, document_list in data.items():
        instance_list[label] = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern="\w+"))
            indice = 0
            for word in tokens:
                if feature_codebook.has_label(word):
                    indice = feature_codebook.get_index(word)
                    vector[indice] = 1.
            instance_list[label].append(vector)
    return instance_list
Developer: Juicechuan, Project: workspace, Lines: 33, Source file: naive_bayes.py
Example 3: load
def load(f=str):
    import re
    files = open(f)
    raw = files.read()
    pattern = re.compile(r"""\$?\d+(\.\d+)?%?   # currency
                           | \d+/\d+/\d+        # dates""", re.VERBOSE)
    nltk.regexp_tokenize(raw, pattern)
Developer: MariaSpyropoulou, Project: NLTK-Book, Lines: 7, Source file: Chapter3.py
Example 4: nltkTest
def nltkTest():
    s = "russia licenza 8.1.5 U.S."
    res = nltk.regexp_tokenize(s, helper.nltkPattern)
    print(res)
    s = "Saldo vs. Fattura n. 2015/004"
    res = nltk.regexp_tokenize(s, helper.nltkPattern)
    print(res)
Developer: cynricshu, Project: ChinaVis2016, Lines: 8, Source file: handleSubject.py
Example 5: regularExpressionTokenizer
def regularExpressionTokenizer():
    text = 'That U.S.A. poster-print costs $12.40...'
    pattern = r'''(?x)          # set flag to allow verbose regexps
          ([A-Z]\.)+            # abbreviations, e.g. U.S.A.
        | \w+(-\w+)*            # words with optional internal hyphens
        | \$?\d+(\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                # ellipsis
        | [][.,;"'?():-_`]      # these are separate tokens
        '''
    print nltk.regexp_tokenize(text, pattern)
Developer: hbdhj, Project: python, Lines: 10, Source file: chapter3.py
Example 6: get_links
def get_links(text):
    # checks only for 'http://...' and 'www...'
    text = text + " "
    pat = "http://.*?\s"
    links = nltk.regexp_tokenize(text, pat)
    text = " " + text + " "
    pat = "\swww\..*?\..*?\s"
    links.extend(nltk.regexp_tokenize(text, pat))
    links = map(lambda x: x[:-1], links)
    return links
Developer: ItsLastDay, Project: Twitter-language-identification, Lines: 10, Source file: string_processing.py
Example 7: poss_test
def poss_test(test_file, test_write, sw_file):
    """
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(test_file)
    reader = csv.reader(f)
    t = open(test_write, "w")
    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    stopwords = sw
    print "stopword list length", len(stopwords)
    stopwords = set(stopwords)
    g = lambda x: x not in stopwords
    for row in reader:
        if a == 0:
            a += 1
            continue
        if a % 1000 == 0:
            print a
        a += 1
        #if a == 8:
        #    sys.exit(1)
        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        #word tokenize
        pattern = r"([a-z])\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        #light stem
        #title = set([stem(word) for word in title])
        #body = set(body)
        #body = set([stem(word) for word in body])
        #remove stopwords
        #body = filter(g, body)
        #title = filter(g, title)
        body = ' '.join(body)
        title = ' '.join(title)
        t.write('%s , %s \n' % (title, body))
Developer: rve, Project: keyword, Lines: 53, Source file: stem.py
Example 8: poss_test
def poss_test(test_file, test_write, sw_file):
    """
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(test_file)
    reader = csv.reader(f)
    t = open(test_write, "w")
    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    #stopwords = sw
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords = set(stopwords)
    g = lambda x: x not in stopwords
    for row in reader:
        if a % 10000 == 0:
            print(a)
        a += 1
        #if a == 8:
        #    sys.exit(1)
        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        #word tokenize
        pattern = r"(\.?[a-z][a-z0-9\+\.\#\-]+[a-z0-9\+\#])"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        #remove stopwords
        body = filter(g, body)
        title = filter(g, title)
        #light stem
        title = set([stem(word) for word in title])
        body = set(body)
        body = set([stem(word) for word in body])
        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s"\n' % (row[0], title, body))
Developer: rve, Project: keyword, Lines: 51, Source file: nltk_without_stem.py
Example 9: query_episode
def query_episode(self, show_title,
                  ep_title, se_number, ep_number, runtime):
    """build video list prior to scoring
    """
    qres = {}
    # Query 1
    qlist = (show_title, ep_title)
    # Search YouTube
    tmp = self.search('%s %s' % qlist)
    for k, v in tmp.items():
        qres[k] = v
    # Query 2
    qlist = (show_title, ep_title,
             se_number, ep_number)
    # Search YouTube
    tmp = self.search('%s %s %s %s' % qlist)
    for k, v in tmp.items():
        qres[k] = v
    # Query 3
    qlist = (show_title,
             se_number, ep_number)
    # Search YouTube
    tmp = self.search('%s s%02de%02d' % qlist)
    for k, v in tmp.items():
        qres[k] = v
    # Show tokens
    sh_stem = [self._lancaster.stem(t)
               for t in nltk.regexp_tokenize(
                   show_title.encode('utf8'), r"\w+")]
    # Episode stem tokens if exist
    if ep_title:
        ep_stem = [self._lancaster.stem(t)
                   for t in nltk.regexp_tokenize(
                       ep_title.encode('utf8'), r"\w+")]
    else:
        ep_stem = None
    res = {'Output': qres,
           'Input': {}}
    res['Input']['show_title'] = show_title
    res['Input']['ep_title'] = ep_title
    res['Input']['sh_stem'] = sh_stem
    res['Input']['ep_stem'] = ep_stem
    res['Input']['se_number'] = se_number
    res['Input']['ep_number'] = ep_number
    res['Input']['runtime'] = runtime
    return res
Developer: BrianDurham, Project: couchtube, Lines: 51, Source file: ytquery.py
Example 10: poss_train
def poss_train(train_file, train_write, sw_file):
    """
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(train_file)
    reader = csv.reader(f)
    t = open(train_write, "w")
    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    #stopwords = sw # use nltk stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    print "stopword list length", len(stopwords)
    stopwords = set(stopwords)
    g = lambda x: x not in stopwords
    for row in reader:
        if a % 100000 == 0:
            print a
        a += 1
        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        #word tokenize
        pattern = r"([a-z])\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        #remove stopwords
        body = filter(g, body)
        title = filter(g, title)
        #light stem
        #st = LancasterStemmer()
        title = set([stem(word) for word in title])
        body = set(body)
        body = set([stem(word) for word in body])
        # list to string
        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s","%s"\n' % (row[0], title, body, row[3]))
Developer: rve, Project: keyword, Lines: 50, Source file: pre_nltk.py
Example 11: normalized
def normalized(text, lowercase=True, fix=True, tuples=False):
    """Tokenize, remove capitalization and exclude punctuation
    """
    if fix:
        text = fix_text(unicode(text))
    pattern = r"""(?x)      # verbose regexps
        \w+(-\w+)*          # words with optional internal hyphens
        """
    result = [w for w in nltk.regexp_tokenize(text, pattern)]
    if lowercase:
        result = [w.lower() for w in nltk.regexp_tokenize(text, pattern)]
    if tuples:
        result = tuple(result)
    return result
Developer: elyase, Project: eikon_challenge, Lines: 14, Source file: utils.py
Example 12: compute_df
def compute_df(self, document_list):
    '''Compute document frequency based on input document list'''
    df_cache = dict()
    df_output = dict()
    d_index = 0
    for document in document_list:
        d_index += 1
        # tokenize each document
        reg_toks = nltk.regexp_tokenize(document, SENTENCE_RE)
        for item in reg_toks:
            # change each word to lower case and lemmatize
            item = normalise(item)
            if item not in df_cache:
                df_cache[item] = set([d_index])
            else:
                df_cache[item].add(d_index)
    for item in df_cache:
        if acceptable_word(item):
            df_output[item] = len(df_cache[item])
    df_output['total_document'] = len(document_list)
    return df_output
Developer: luotigerlsx, Project: DataAnalysis_ML, Lines: 25, Source file: keyword_extract.py
Example 13: main
def main(self, text):
    """Breaks a single string into a tree using the grammar and returns
    the specified words as a string."""
    if text is None:
        return None
    try:
        text = text.encode("ascii", "ignore")
    except:
        text = text.decode("utf-8", "ignore").encode("ascii", "ignore")
    chunker = nltk.RegexpParser(grammar)
    toks = nltk.regexp_tokenize(text, sentence_re)
    postoks = nltk.tag.pos_tag(toks)
    #print postoks
    tree = chunker.parse(postoks)
    terms = self.get_terms(tree)
    words = self.get_words(terms)
    return words
Developer: hongyu89, Project: IndeedScraper, Lines: 25, Source file: GrammarParser.py
Example 14: generate_vocab
def generate_vocab(papers):
    """Returns the vocabulary used in the papers given in parameters, after cleaning and stopword removal.

    Args:
        papers (list of tuples): the raw list of papers from which the vocabulary is generated (each element is a tuple of 3 strings: id, title and abstract)

    Returns:
        list of strings: the list of tokens forming the vocabulary
    """
    sc = StringCleaner()
    # Generate author's vocabulary
    corpus = " ".join(p[1] + " " + p[2] for p in papers)
    # Cleaning
    corpus = sc.clean_string(corpus)
    # Tokenization
    pattern = r"(?:[A-Z]\.)+|\w+(?:-\w+)*|\d+(?:\.\d+)?%?"
    # we keep tokens that are words (with optional internal hyphens), acronyms and percentages
    tokens = set(nltk.regexp_tokenize(corpus, pattern)) - set(nltk.corpus.stopwords.words("english"))
    num_re = re.compile("^\d+$")
    tokens = set([t for t in tokens if not num_re.match(t)])  # we remove only-numeric tokens
    # Stemming
    porter = nltk.stem.PorterStemmer()
    return [porter.stem(t) for t in tokens]
Developer: tizot, Project: recom-system, Lines: 25, Source file: dataset_tools.py
Example 15: extract
def extract(self, text):
    ''' Extract and freudify noun phrases from text, return all successfully
    freudified noun phrases. '''
    toks = nltk.regexp_tokenize(text, self.sentence_re)
    postoks = nltk.tag.pos_tag(toks)
    tree = self.chunker.parse(postoks)
    terms = self._get_terms(tree)
    phrases = sets.Set()
    # Loop through all the noun phrases and try to freudify them.
    for term in terms:
        if (len(term)) < 2: continue
        changed = False
        context = ""
        phrase = []
        for part in term:
            word, tag = part
            word = word.encode('ascii', 'replace')
            phrase.append(word.lower())
            rpl = self.replace_word(tag[:2], word)
            if len(rpl[2]) > 0:
                context = rpl[2]
                phrase[-1] = rpl[0]
                changed = True
        if changed:
            phrase = " ".join(phrase).strip()
            phrase.encode('ascii', 'replace')
            phrase = str(phrase)
            if phrase not in self.own_phrases[context]:
                phrases.add((str(phrase), context))
    phrases = list(phrases)
    return phrases
Developer: assamite, Project: agentwordgame, Lines: 35, Source file: freud.py
Example 16: ShowCollocations
def ShowCollocations():
    text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and the stopword list from the NLTK loaded. See Help for details \n\n\n")
    import nltk
    from nltk.collocations import BigramCollocationFinder
    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures
    from nltk.metrics import TrigramAssocMeasures
    pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
    data = resultsbox.get(1.0, END)
    rawtext = nltk.regexp_tokenize(data, pattern)
    prepcolloc = [word.lower() for word in rawtext if not word in stopwords and word.isalpha()]
    text.delete(1.0, END)
    text.insert(END, "Collocations (occurring at least 3 times with a PMI of 10)\n")
    text.insert(END, "\nBigram Collocations:\n")
    bigram = BigramAssocMeasures()
    bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
    bigramfinder.apply_freq_filter(3)
    bigrams = bigramfinder.nbest(bigram.pmi, 10)
    for item in bigrams:
        first = item[0]
        second = item[1]
        text.insert(END, first)
        text.insert(END, " ")
        text.insert(END, second)
        text.insert(END, "\n")
Developer: muranava, Project: Text-Tools, Lines: 25, Source file: collocationreadability.py
Example 17: word_couple_con_puntuacion_pares_minusculas
def word_couple_con_puntuacion_pares_minusculas(lista):
    word_couples = []
    regexp = "[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+-*[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+|[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+|[.]+|[/,$?:;!()&%#=+{}*~.]+|[0-9]+"
    for oracion in lista:
        #oracion = str(oracion)
        #oracion = oracion.to_lower
        #print oracion
        tokens = nltk.regexp_tokenize(oracion.lower(), regexp)
        #print len(tokens)
        # tokens_lower = []
        # for i in range(len(tokens)):
        #     palabra = str(tokens[i])
        #     tokens_lower.append(palabra.to_lower())
        pairs = list(itertools.permutations(tokens, 2))
        for pair in pairs:
            word_couples.append(pair[0] + "~" + pair[1])
    return word_couples
Developer: jesusmiguelgarcia, Project: FSTmikes, Lines: 27, Source file: attr_util_mk.py
Example 18: handle_doc
def handle_doc(word_set, rs_path):
    doc_dir = os.listdir(rs_path)
    doc_matrix = []
    doc_cat = []
    for docs in doc_dir:
        files = os.listdir(rs_path + docs)
        print "start to handle the --> " + docs
        for file_d in files:
            d_path = rs_path + docs + '/' + file_d
            #get the single file path
            with open(d_path, 'rb') as text_file:
                str_tmp = ''
                file_lines = text_file.readlines()
                for line in file_lines:
                    pattern = r'''[a-zA-Z]+'''
                    tokens = nltk.regexp_tokenize(line, pattern)
                    for t in tokens:
                        if t.lower() in word_set:
                            str_tmp += t.lower()
                            str_tmp += ' '
                doc_matrix.append(str_tmp)
                doc_cat.append(cat_dic[docs])
            text_file.close()
    str_tmp = ''
    for sw in word_set:
        str_tmp += sw
        str_tmp += ' '
    doc_matrix.append(str_tmp)
    doc_cat.append('NAN')
    vectorizer = CountVectorizer()
    doc_num = vectorizer.fit_transform(doc_matrix)
    tfidf = TfidfTransformer()
    doc_tfidf = tfidf.fit_transform(doc_num)
    return doc_tfidf[:-1, :], doc_cat[:-1]
Developer: CharLLCH, Project: work-for-py, Lines: 34, Source file: adjust_word.py
Example 19: longitud_promedio_palabras_moens
def longitud_promedio_palabras_moens(lista):
    regexp = "[a-zA-Z'ÁÉÍÓÚáéíóúñÑüÜ]+"
    total_palabras_en_oraciones = 0
    num_oraciones = 0
    tokens = 0
    promedio_longitud_palabras_oraciones = []
    for oracion in lista:
        total_palabras_oracion = 0
        num_palabras_oracion = 0
        tokens = nltk.regexp_tokenize(oracion, regexp)
        total_palabras_en_oraciones += len(tokens)
        for palabra in tokens:
            total_palabras_oracion += len(palabra)
            num_palabras_oracion += 1
            #print palabra
            #print len(palabra)
        if total_palabras_oracion > 0:
            promedio_longitud_palabras_oraciones.append(total_palabras_oracion / num_palabras_oracion)
        else:
            print oracion
        #print len(tokens)
        #total += len(oracion.split())
        num_oraciones += 1
    #promedio = total_palabras_en_oraciones / num_oraciones
    #print promedio_longitud_palabras_oraciones
    suma_promedios = 0
    num_promedios = 0
    for promedios in promedio_longitud_palabras_oraciones:
        suma_promedios += promedios
        num_promedios += 1
    promedio = suma_promedios / num_promedios
    #promedio = sum(promedio_longitud_palabras_oraciones)/float(len(promedio_longitud_palabras_oraciones))
    return promedio
Developer: jesusmiguelgarcia, Project: FSTmikes, Lines: 34, Source file: attr_util_mk.py
Example 20: handleSubject1
def handleSubject1(outputFile):
    """
    :return: dict
    """
    index = 0
    termdict = dict()
    subjectList = list()
    f = open("data/topic/subject1_w_date.txt")
    for item in f:
        array = item.strip().split("DELIMER")
        count = array[0]
        subject = array[3]
        for (regex, repl) in helper.regexList.items():
            subject = regex.sub(repl, subject)
        for s in helper.specialSet:
            subject = subject.replace(s, "")
        termList = nltk.regexp_tokenize(subject, helper.nltkPattern)  # use the nltk package to tokenize the subject
        s = ""
        for term in termList:
            if term.lower() not in helper.excludeSet:
                s += term + " "  # reconstruct the subject
                if term not in termdict:
                    termdict[term.strip()] = index
                    index += 1
        if s != "":
            regex = re.compile("\s+")
            s = regex.sub(" ", s)
            subjectList.append("{}DELIMER{}DELIMER{}DELIMER{}".format(count, array[1], array[2], s.strip()))
    fileHelper.writeIterableToFile(outputFile, subjectList)
    return termdict
Developer: cynricshu, Project: ChinaVis2016, Lines: 35, Source file: handleSubject.py
Note: The nltk.regexp_tokenize examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by many developers; copyright remains with the original authors, and redistribution or use should follow the corresponding project's license. Do not reproduce without permission.