本文整理汇总了Python中wordsegment.segment函数的典型用法代码示例。如果您正苦于以下问题:Python segment函数的具体用法?Python segment怎么用?Python segment使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了segment函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: main
def main(arg="iamtoocoolforthis"):
s = clean(arg)
print "CLEANED STRING:", s
print "======================RUNNING OPTIMIZED==================="
print segment_method1(s)
print "======================RUNNING VANILLA==================="
print segment(s)
开发者ID:nitin7,项目名称:WordBreak,代码行数:8,代码来源:example.py
示例2: precisioncalc
def precisioncalc(query):
print query,
k = searchgoogle(query)
seg = segment(query)
m = []
for n in seg:
m.append(stemming.porter2.stem(n))
seg = " ".join(m)
if socialListProxy:
proxy = ulib.ProxyHandler({'https': socialListHttps_Proxy, 'http': socialListHttp_Proxy})
opener = ulib.build_opener(proxy)
ulib.install_opener(opener)
counter = 0
total = 0
for i in xrange(len(k)):
req = ulib.Request(k[i], headers={'User-Agent': "Mozilla/5.0"})
k[i] = segment(k[i])
l = []
for j in k[i]:
l.append(stemming.porter2.stem(j))
k[i] = " ".join(k[i])
# print k[i]
try:
content = ulib.urlopen(req)
x = re.findall("<\S*?title\S*?>(.*?)<\S*?/\S*?title\S*?>", content.read())
t = []
for s in x:
t.append(stemming.porter2.stem(s))
t = " ".join(t)
# print t
if ((seg in k[i]) or (seg in t)):
counter = counter + 1
total = total + 1
except:
pass
if (total == 10):
print str(counter)+"/"+str(total),
if (total == 20):
print str(counter)+"/"+str(total),
if total < 10:
print str(counter)+"/"+str(10), str(counter)+"/"+str(20)
elif total < 20:
print str(counter)+"/"+str(20)
else:
print ""
#precisioncalc("madhusai") #uncomment this to check the presion of some word
开发者ID:SummerProject16,项目名称:SocialList,代码行数:49,代码来源:precision.py
示例3: info_extract
def info_extract(u):
    """Build a '####'-separated info string (categories, senses, url
    category, tweet category, sentiment) for the tweet text `u`."""
    # Remove URLs from the tweet, re-joining the remaining fragments.
    fragments = url.split(u)
    rebuilt = "".join(piece + " " for piece in fragments)
    tokens = sep.split(rebuilt)
    expanded = ""
    for idx in range(len(tokens)):
        tok = tokens[idx]
        # Hashtags and @-mentions: drop the marker character and
        # word-segment what remains.
        if hasht.match(tok) or atp.match(tok):
            words = segment(tok[1:].lower())
            tok = "".join(w + " " for w in words)
            tokens[idx] = tok
        expanded += tok + " "
    result = ""
    result = result + categorize(expanded) + "####"
    result = result + babelnet(expanded) + "####"
    found = url.search(u)
    try:
        # Raises when no URL was present in the tweet.
        result = result + url_categ(str(found.group(0))) + "<br>"
    except:
        pass
    result = result + twe_cat(expanded) + "####"
    result = result + senti(u) + "####"
    return result
开发者ID:InfoExtr2015,项目名称:Retrieval_Extraction_Information,代码行数:32,代码来源:__init__.py
示例4: create_dict
def create_dict():
relation_name=[x[2] for x in os.walk("nell/relations")][0]
sub_table={}
obj_table={}
for r in relation_name:
lst=[]
r_name=' '.join(segment(r.split(':')[1]))
print r_name
with open("nell/relations/"+r) as fp:
for line in fp:
line=line.rstrip('\n')
sub,obj=line.split('\t')
sub=' '.join((sub.split(":")[2]).split('_'))
obj=' '.join((obj.split(":")[2]).split('_'))
if sub in sub_table:
tmp=sub_table[sub]
tmp=tmp.union([r_name])
sub_table[sub]=tmp
#print("y")
else:
sub_table[sub]=set([r_name])
if obj in obj_table:
tmp=obj_table[obj]
tmp=tmp.union([r_name])
obj_table[obj]=tmp
#print("yy")
else:
obj_table[obj]=set([r_name])
#print len(sub_table[sub]),len(obj_table[obj])
return sub_table,obj_table
开发者ID:vedsarkushwaha,项目名称:KBH_NELL,代码行数:30,代码来源:relations_dict.py
示例5: test_segment_12
def test_segment_12():
    # Segmenting the concatenation of the words must round-trip exactly.
    expected = (
        'far out in the uncharted backwaters of the unfashionable end of '
        'the western spiral arm of the galaxy lies a small un regarded '
        'yellow sun'
    ).split()
    assert segment(''.join(expected)) == expected
开发者ID:grantjenks,项目名称:wordsegment,代码行数:7,代码来源:test_coverage.py
示例6: test_segment_10
def test_segment_10():
    # Segmenting the concatenation of the words must round-trip exactly.
    expected = (
        'as gregor samsa awoke one morning from uneasy dreams he found '
        'himself transformed in his bed into a gigantic insect'
    ).split()
    assert segment(''.join(expected)) == expected
开发者ID:grantjenks,项目名称:wordsegment,代码行数:7,代码来源:test_coverage.py
示例7: test_segment_9
def test_segment_9():
    # Segmenting the concatenation of the words must round-trip exactly.
    expected = (
        'it was the best of times it was the worst of times it was the '
        'age of wisdom it was the age of foolishness'
    ).split()
    assert segment(''.join(expected)) == expected
开发者ID:grantjenks,项目名称:wordsegment,代码行数:7,代码来源:test_coverage.py
示例8: k_list_repeat
def k_list_repeat(query):
    """Google `query` and return the stemmed page titles of the results.

    Side effect: each entry of the result-URL list is replaced in place by
    its word-segmented, stemmed, space-joined form.  Unreachable pages are
    skipped (best-effort).
    """
    k = searchgoogle(query)
    m = []
    if socialListProxy:
        proxy = ulib.ProxyHandler({'https': socialListHttps_Proxy, 'http': socialListHttp_Proxy})
        opener = ulib.build_opener(proxy)
        ulib.install_opener(opener)
    for i in xrange(len(k)):
        req = ulib.Request(k[i], headers={'User-Agent': "Mozilla/5.0"})
        k[i] = segment(k[i])
        l = []
        for j in k[i]:
            l.append(stemming.porter2.stem(j))
        # BUG FIX: join the *stemmed* words — the original joined the raw
        # segments, leaving `l` built but never used.
        k[i] = " ".join(l)
        try:
            content = ulib.urlopen(req)
            # reading the title of url
            x = re.findall("<\S*?title\S*?>(.*?)<\S*?/\S*?title\S*?>", content.read())
            t = []
            for s in x:
                t.append(stemming.porter2.stem(s))
            t = " ".join(t)
            m.append(t)
        except Exception:
            pass
    return m
开发者ID:SummerProject16,项目名称:SocialList,代码行数:30,代码来源:k_list_repeat.py
示例9: segment_hashtag
def segment_hashtag(h):
    """segment the words inside the hashtag h, discard non alphanum chars"""
    # Accept either a regex match object or a plain string; strip the '#'.
    tag = h.group()[1:] if hasattr(h, "group") else h[1:]
    return " hashtag " + " ".join(wordsegment.segment(tag)) + " , "
开发者ID:lambros-mavrides,项目名称:ml_practice,代码行数:8,代码来源:morph.py
示例10: get_word_vector
def get_word_vector(self, word):
    """Return (and cache) the embedding vector for `word`, or None.

    Lookup order: exact case, lower case, then upper case against the
    word2vec vocabulary.  Failing that, the word is split on whitespace,
    then ',', '/', ':', '-', '_' in turn, and as a last resort
    word-segmented; sub-word vectors are combined by repeated FFT
    convolution.  Results — including failures, stored as None — are
    cached in self.word_vectors_map keyed by the lower-cased word.

    NOTE(review): assumes self.word2vec_model is gensim-like (a `.vocab`
    mapping plus __getitem__ returning an array) — confirm against the
    class constructor, which is outside this view.
    """
    if word is None:
        return None
    # Trim surrounding whitespace and bracket characters.
    word = word.strip().strip('[').strip(']').strip('(').strip(')')
    word_lower = word.lower()
    word_upper = word.upper()
    try:
        if word_lower not in self.word_vectors_map:
            if config.debug:
                print 'getting word vector for ', word
            if word in self.word2vec_model.vocab:
                self.word_vectors_map[word_lower] = self.word2vec_model[word]
            #todo: if vocab is ensured to be lower case, this condition is not required
            elif word_lower in self.word2vec_model.vocab:
                self.word_vectors_map[word_lower] = self.word2vec_model[word_lower]
            elif word_upper in self.word2vec_model.vocab:
                self.word_vectors_map[word_lower] = self.word2vec_model[word_upper]
            else:
                # If stripping concept markup leaves nothing, retry with
                # only the alphabetic characters of the word.
                if not constants.concept_regexp.sub('', word):
                    return self.get_word_vector(constants.alpha_regex.sub('', word))
                # Try successively more aggressive splits until one yields
                # more than a single piece.
                subwords = word.split()
                if len(subwords) == 1:
                    subwords = word.split(',')
                if len(subwords) == 1:
                    subwords = word.split('/')
                if len(subwords) == 1:
                    subwords = word.split(':')
                if len(subwords) == 1:
                    subwords = word.split('-')
                if len(subwords) == 1:
                    subwords = word.split('_')
                if len(subwords) == 1:
                    # print 'performing word segmentation on ', word
                    subwords = ws.segment(word.encode('utf8'))
                if len(subwords) == 1:
                    # Every splitting strategy failed: cache the miss.
                    print 'could not get wordvector for ', word
                    self.word_vectors_map[word_lower] = None
                if len(subwords) > 1:
                    # Recursively fetch each sub-word vector and fold them
                    # together with a same-size FFT convolution.
                    curr_wordvec = None
                    for curr_subword in subwords:
                        curr_subword_vec = self.get_word_vector(curr_subword)
                        if curr_subword_vec is not None:
                            if curr_wordvec is None:
                                curr_wordvec = curr_subword_vec
                            else:
                                start_time = time.time()
                                curr_wordvec = ss.fftconvolve(curr_wordvec, curr_subword_vec, mode='same')
                                if config.debug:
                                    print 'performed fast fourier transform convolution on word vectors in {} seconds.'.format(time.time()-start_time)
                    self.word_vectors_map[word_lower] = curr_wordvec
        return self.word_vectors_map[word_lower]
    except UnicodeDecodeError as ude:
        # Cache the failure so the bad word is not retried.
        print 'error getting word vector for ', word
        print ude.message
        self.word_vectors_map[word_lower] = None
        return self.word_vectors_map[word_lower]
开发者ID:sgarg87,项目名称:big_mech_isi_gg,代码行数:56,代码来源:word_vectors.py
示例11: read_nell_relations
def read_nell_relations():
    """
    Read relation names from the NELL graph directory tree.

    Every file name under nell/relations has the form "concept:relation";
    the relation part is word-segmented into a space-joined phrase.
    Returns the list of relation phrases.
    """
    relation = []
    for walk_entry in os.walk("nell/relations"):
        for fname in walk_entry[2]:
            relation.append(' '.join(segment(fname.split(':')[1])))
    return relation
开发者ID:vedsarkushwaha,项目名称:KBH_NELL,代码行数:13,代码来源:read_data.py
示例12: test12
def test12(tagtocheck):
    """Return the misspelled/correct ratio of the segmented hashtag as a
    4-decimal string, or 0 when no segment is a dictionary word."""
    checker = en.Dict("en-US")
    good = 0
    bad = 0
    for token in ws.segment(tagtocheck):
        if checker.check(token):
            good += 1
        else:
            bad += 1
    if good == 0:
        return 0
    return "%.4f" % (float(bad) / good)
开发者ID:SummerProject16,项目名称:SocialList,代码行数:14,代码来源:testFile12.py
示例13: create_dict_adva
def create_dict_adva():
relation_name=[x[2] for x in os.walk("nell/relations")][0]
sub_table={}
obj_table={}
for r in relation_name:
lst=[]
r_name=' '.join(segment(r.split(':')[1]))
print r_name
with open("nell/relations/"+r) as fp:
for line in fp:
line=line.rstrip('\n')
sub,obj=line.split('\t')
sub=sub.split(":")[1:]
obj=obj.split(":")[1:]
for tmp in sub:
tmpsb=''.join(tmp.split('_'))
tmpsb=segment(tmpsb)
for sb in tmpsb:
if sb in sub_table:
tmp=sub_table[sb]
tmp=tmp.union([r_name])
sub_table[sb]=tmp
#print("y")
else:
sub_table[sb]=set([r_name])
for tmp in obj:
tmpob=''.join(tmp.split('_'))
tmpob=segment(tmpob)
for ob in tmpob:
if ob in obj_table:
tmp=obj_table[ob]
tmp=tmp.union([r_name])
obj_table[ob]=tmp
#print("yy")
else:
obj_table[ob]=set([r_name])
return sub_table,obj_table
开发者ID:vedsarkushwaha,项目名称:KBH_NELL,代码行数:37,代码来源:relations_dict.py
示例14: read_relation_name
def read_relation_name(folder_name):
    """
    Collect relation names from the sub-folder names inside folder_name.
    Each sub-folder should be named "concept:relation"; the relation part
    is word-segmented into a space-joined phrase.
    Returns the list of relation phrases.
    """
    dir_paths = [entry[0] for entry in os.walk(folder_name)]
    folder_list = []
    # Skip the walk root itself, then segment each sub-folder's name.
    for path in dir_paths[1:]:
        folder_list.append(' '.join(segment(path.split(':')[1])))
    # The first collected entry is also dropped, mirroring the original's
    # double [1:] slicing.
    return folder_list[1:]
开发者ID:vedsarkushwaha,项目名称:KBH_NELL,代码行数:15,代码来源:read_data.py
示例15: checkTweetNums
def checkTweetNums(tweets,minTweets):
    """Return 1 when at least minTweets tweets pair a number with a
    noun-like POS tag, else 0."""
    # Word-segment every tweet before POS tagging.
    segmented = []
    for tweet in tweets:
        segmented.append(" ".join(wordsegment.segment(tweet)))
    hits = 0
    for tags in cmu.runtagger_parse(segmented):
        tagstring = "".join(tags)
        # '$' (numeral) immediately followed by a noun-ish tag:
        # checking for consecutive numbers and nouns.
        if "$N" in tagstring or "$^" in tagstring or "$M" in tagstring or "$Z" in tagstring:
            hits += 1
    return 1 if hits >= minTweets else 0
开发者ID:SummerProject16,项目名称:opinion-or-fact,代码行数:16,代码来源:TweetCheck.py
示例16: pos_tag_entropy
def pos_tag_entropy(tagtocheck, pos_list):
    """Return the Shannon entropy (base 2) of the POS-tag distribution in
    `pos_list`, formatted as a string with 4 decimal places.

    `tagtocheck` is kept for interface compatibility but is not used: the
    original implementation word-segmented it into a variable that was
    never read, so that dead call has been removed.  The manual copy loop
    has been replaced by Counter(pos_list) directly — same multiplicities,
    same iteration order, identical result.
    """
    len_list = len(pos_list)
    counts = Counter(pos_list)  # tag -> multiplicity
    ent = 0.0
    for tag in counts:
        freq = float(counts[tag]) / len_list
        ent += freq * math.log(freq, 2)
    return "%.4f" % (-ent)
开发者ID:SummerProject16,项目名称:SocialList,代码行数:16,代码来源:testFile11.py
示例17: getchunks
def getchunks(password):
    """Split a password into symbol/letter/digit chunks, word-segmenting
    multi-letter alphabetic runs."""
    # Partition into maximal runs of symbols, letters, or digits.
    runs = re.findall('([\W_]+|[a-zA-Z]+|[0-9]+)', password)
    chunks = []
    for run in runs:
        # Only alphabetic runs longer than one character are segmented
        # into dictionary words; everything else passes through as-is.
        if run[0].isalpha() and len(run) > 1:
            chunks.extend(ws.segment(run))
        else:
            chunks.append(run)
    if not chunks:
        log.warning("Unable to chunk password: {}".format(password))
    return chunks
开发者ID:vialab,项目名称:semantic-guesser,代码行数:17,代码来源:train.py
示例18: getWeight
def getWeight(hashtag,text_file):
    """Return a list of weights, one per string in text_file.

    The hashtag is word-segmented and googled; each string's occurrence
    count on every non-google/non-youtube result page is scaled by the
    page's rank (earlier results weigh more) and summed across pages.

    NOTE(review): assumes `Set` is sets.Set (or equivalent) from module
    scope, and that at least one page yields an occurrence list —
    `occurance[0]` raises IndexError when every fetch fails.  Confirm
    against callers.
    """
    #this function returns a list of weights of the strings in the text_file
    #proxy_handler (hard-coded institutional proxy)
    proxy = ulib.ProxyHandler({'https': 'https://10.3.100.207:8080','http' : 'http://10.3.100.207:8080'})
    opener = ulib.build_opener(proxy)
    ulib.install_opener(opener)
    #split the hashtag into words
    spl_hash = ws.segment(hashtag)
    req = ulib.Request('https://www.google.co.in/search?q='+'+'.join(spl_hash), headers={'User-Agent' : "Mozilla/5.0"})
    dumpdata = ulib.urlopen(req).read()
    dumpdata = ulib.unquote(dumpdata)
    # Pull candidate result URLs out of the raw search page markup.
    urls_ = re.findall("(http[s]*://[^:<&%]*?)[\"& ]",dumpdata)
    urls = Set()
    for _ in urls_:
        # Skip google-internal and youtube links; Set deduplicates.
        if not "google" in _ and not "youtube" in _:
            urls.add(_)
    occurance = []
    for _url in urls:
        try:
            temp = get_occurence_list(_url,text_file)
            occurance.append(temp)
            #frequencies of string for url _url
        except:
            # Best-effort: unreachable pages are skipped silently.
            pass
    #now occurance is a list of lists containing frequencies for each url
    final = [0 for _ in range(len(occurance[0]))]
    _length = len(occurance)
    #_length is total number of urls present
    for _x in range(len(occurance[0])):
        _x1 = 0
        for _o in occurance:
            final[_x] += _o[_x]*(_length-_x1)
            #multiplying frequency in each url with url position from bottom, which gives the weight
            _x1 += 1
    return final
开发者ID:SummerProject16,项目名称:SocialList,代码行数:43,代码来源:new_file.py
示例19: checkCategories
def checkCategories(hashtag):
    """Return 1 when the segmented hashtag matches any known list-style
    category pattern ("10 reasons ...", "how to ...", etc.), else 0."""
    text = " ".join(ws.segment(hashtag))
    patterns = [
        ".+?in\s\d+\swords",
        ".+?in\s\d+\ssentences",
        ".*?\d+\sreasons.+",
        ".*?\d+\swords\sto.+",
        "^reasons\s.+",
        ".*?ways\sto.+",
        ".*?how\sto.+",
        ".*?\d+\sways\sto.+",
        ".*?\d+\sthings\sto.+",
        "^things.+",
        "^describe.*?in.*?",
        "^name\ssome.+?",
    ]
    # Add new categories here if found any.
    for pattern in patterns:
        if re.match(pattern, text):
            return 1
    return 0
开发者ID:SummerProject16,项目名称:SocialList,代码行数:20,代码来源:Category.py
示例20: getWeight
def getWeight(hashtag="", string=""):
    """Weight `string` by its occurrence count across google results for
    the segmented hashtag; higher-ranked result pages count more."""
    proxy = ulib.ProxyHandler({'https': 'https://10.3.100.207:8080','http' : 'http://10.3.100.207:8080'})
    opener = ulib.build_opener(proxy)
    ulib.install_opener(opener)
    query_words = ws.segment(hashtag)
    req = ulib.Request('https://www.google.com/search?q='+'+'.join(query_words), headers={'User-Agent' : "Mozilla/5.0"})
    dumpdata = ulib.urlopen(req).read()
    urls = re.findall("(http.*?)[\" ]",dumpdata)
    weight = 0
    # Rank multiplier: the top result gets len(urls), the last gets 1.
    rank = len(urls)
    for page_url in urls:
        page_req = ulib.Request(page_url,headers={'User-Agent' : "Mozilla/5.0"})
        try:
            page = ulib.urlopen(page_req).read().lower()
            weight += len(re.findall(string.lower(), page)) * rank
        except:
            # Best-effort: unreachable pages contribute nothing.
            pass
        rank -= 1
    return weight
开发者ID:SummerProject16,项目名称:SocialList,代码行数:22,代码来源:frequency.py
注:本文中的wordsegment.segment函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论