• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python wordsegment.segment函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中wordsegment.segment函数的典型用法代码示例。如果您正苦于以下问题:Python segment函数的具体用法?Python segment怎么用?Python segment使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了segment函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: main

def main(arg="iamtoocoolforthis"):
    """Demo driver: clean *arg*, then print the result of the optimized
    segmenter next to the vanilla ``wordsegment.segment`` output.

    ``clean`` and ``segment_method1`` are project helpers defined
    elsewhere in the repository (not visible in this snippet).
    """
    s = clean(arg)
    print "CLEANED STRING:", s
    print "======================RUNNING OPTIMIZED==================="
    print segment_method1(s)
    print "======================RUNNING VANILLA==================="
    print segment(s)
开发者ID:nitin7,项目名称:WordBreak,代码行数:8,代码来源:example.py


示例2: precisioncalc

def precisioncalc(query):
	"""Estimate retrieval precision for *query*.

	Googles the query (via the project helper ``searchgoogle``), then for
	each result URL checks whether the stemmed, segmented query appears in
	the segmented URL text or in the page's <title>.  Prints running
	"hits/total" precision at 10 and 20 results.
	"""
	print query,
	k = searchgoogle(query)
	# Segment the query into words, stem each, and re-join.
	seg = segment(query)
	m = []
	for n in seg:
		m.append(stemming.porter2.stem(n))
	seg = " ".join(m)
	# Optionally route all HTTP(S) traffic through the configured proxy.
	if socialListProxy:
		proxy = ulib.ProxyHandler({'https': socialListHttps_Proxy, 'http': socialListHttp_Proxy})
		opener = ulib.build_opener(proxy)
		ulib.install_opener(opener)
	counter = 0
	total = 0
	for i in xrange(len(k)):
		req = ulib.Request(k[i], headers={'User-Agent': "Mozilla/5.0"})
		k[i] = segment(k[i])
		l = []
		for j in k[i]:
			l.append(stemming.porter2.stem(j))
		# NOTE(review): ``l`` (the stemmed URL tokens) is built but never
		# used; the join below re-assembles the *unstemmed* segments, so
		# the stemmed query is compared against unstemmed text.  Looks
		# like a bug -- confirm intent before changing.
		k[i] = " ".join(k[i])
		# print k[i]
		try:
			content = ulib.urlopen(req)
			# Extract the <title> text of the result page.
			x = re.findall("<\S*?title\S*?>(.*?)<\S*?/\S*?title\S*?>", content.read())
			t = []
			for s in x:
				t.append(stemming.porter2.stem(s))
			t = " ".join(t)
			# print t
			if ((seg in k[i]) or (seg in t)):
				counter = counter + 1
			total = total + 1
		except:
			# NOTE(review): bare except silently skips fetch/parse errors.
			pass

		# Report precision checkpoints at exactly 10 and 20 fetched pages.
		if (total == 10):
			print str(counter)+"/"+str(total),
		if (total == 20):
			print str(counter)+"/"+str(total),


	# Fewer results than a checkpoint: pad the report.
	if total < 10:
		print str(counter)+"/"+str(10), str(counter)+"/"+str(20)
	elif total < 20:
		print str(counter)+"/"+str(20)
	else:
		print ""
#precisioncalc("madhusai") #uncomment this to check the presion of some word
开发者ID:SummerProject16,项目名称:SocialList,代码行数:49,代码来源:precision.py


示例3: info_extract

def info_extract(u):
        """Extract categorized information from tweet text *u*.

        Splits out URLs, expands hashtags/@-mentions into words via
        ``segment``, then concatenates the outputs of several external
        classifiers (``categorize``, ``babelnet``, ``url_categ``,
        ``twe_cat``, ``senti``) joined by "####" markers.  ``url``,
        ``sep``, ``hasht`` and ``atp`` are presumably module-level
        compiled regexes -- defined outside this snippet, confirm.
        """
        final_string = ""
        # Remove URLs from the tweet; url.split() yields the non-URL parts.
        twe = url.split(u)

        newtweet=""
        for a in range(len(twe)):
            newtweet = newtweet+twe[a]+" "

        text = sep.split(newtweet);
        tex=""
        for i in range(len(text)):
                # Hashtags / mentions: drop the leading marker and expand
                # the remainder into space-separated words.
                if(hasht.match(text[i]) or atp.match(text[i])):
                        m=text[i][1:]
                        text[i]=segment(m.lower())
                        n=""
                        for j in text[i]:
                            n=n+j+" "
                        text[i]=n
                tex+=text[i]+" "

        final_string=final_string+categorize(tex)+"####"
        final_string=final_string+babelnet(tex)+"####"
        # If the tweet contained a URL, also categorize the URL itself.
        twee = url.search(u)
        try:
            urls = str(twee.group(0))
            final_string=final_string+url_categ(urls)+"<br>"
        except:
            # No URL present (twee is None) or categorization failed.
            pass
        final_string=final_string+twe_cat(tex)+"####"
        final_string=final_string+senti(u)+"####"
        return final_string
开发者ID:InfoExtr2015,项目名称:Retrieval_Extraction_Information,代码行数:32,代码来源:__init__.py


示例4: create_dict

def create_dict():
	relation_name=[x[2] for x in os.walk("nell/relations")][0]
	sub_table={}
	obj_table={}
	for r in relation_name:
		lst=[]
		r_name=' '.join(segment(r.split(':')[1]))
		print r_name
		with open("nell/relations/"+r) as fp:
			for line in fp:
				line=line.rstrip('\n')
				sub,obj=line.split('\t')
				sub=' '.join((sub.split(":")[2]).split('_'))
				obj=' '.join((obj.split(":")[2]).split('_'))
				if sub in sub_table:
					tmp=sub_table[sub]
					tmp=tmp.union([r_name])
					sub_table[sub]=tmp
					#print("y")
				else:
					sub_table[sub]=set([r_name])
				if obj in obj_table:
					tmp=obj_table[obj]
					tmp=tmp.union([r_name])
					obj_table[obj]=tmp
					#print("yy")
				else:
					obj_table[obj]=set([r_name])
				#print len(sub_table[sub]),len(obj_table[obj])
	return sub_table,obj_table
开发者ID:vedsarkushwaha,项目名称:KBH_NELL,代码行数:30,代码来源:relations_dict.py


示例5: test_segment_12

def test_segment_12():
    """segment() must recover this 25-word sentence from its concatenation."""
    expected = ('far out in the uncharted backwaters of the unfashionable '
                'end of the western spiral arm of the galaxy lies a small '
                'un regarded yellow sun').split()
    assert segment(''.join(expected)) == expected
开发者ID:grantjenks,项目名称:wordsegment,代码行数:7,代码来源:test_coverage.py


示例6: test_segment_10

def test_segment_10():
    """segment() must recover the Kafka opening line from its concatenation."""
    expected = ('as gregor samsa awoke one morning from uneasy dreams he '
                'found himself transformed in his bed into a gigantic '
                'insect').split()
    assert segment(''.join(expected)) == expected
开发者ID:grantjenks,项目名称:wordsegment,代码行数:7,代码来源:test_coverage.py


示例7: test_segment_9

def test_segment_9():
    """segment() must recover the Dickens opening line from its concatenation."""
    expected = ('it was the best of times it was the worst of times it was '
                'the age of wisdom it was the age of foolishness').split()
    assert segment(''.join(expected)) == expected
开发者ID:grantjenks,项目名称:wordsegment,代码行数:7,代码来源:test_coverage.py


示例8: k_list_repeat

def k_list_repeat(query):
	"""Fetch Google results for *query* and return the stemmed <title>
	text of each result page.

	Returns a list of stemmed title strings; pages that fail to load or
	parse are skipped silently.
	"""
	k = searchgoogle(query)
	m = []

	# Optionally install the configured HTTP(S) proxy for all requests.
	if socialListProxy:
		proxy = ulib.ProxyHandler({'https': socialListHttps_Proxy, 'http': socialListHttp_Proxy})
		opener = ulib.build_opener(proxy)
		ulib.install_opener(opener)

	for i in xrange(len(k)):
		req = ulib.Request(k[i], headers={'User-Agent': "Mozilla/5.0"})
		k[i] = segment(k[i])
		l = []
		for j in k[i]:
			l.append(stemming.porter2.stem(j))
		# NOTE(review): ``l`` (stemmed URL tokens) and the re-joined
		# ``k[i]`` below are never read afterwards -- apparent dead code,
		# kept for fidelity; confirm before removing.
		k[i] = " ".join(k[i])
		# print k[i]
		try:
			content = ulib.urlopen(req)
			#reading the title of url
			x = re.findall("<\S*?title\S*?>(.*?)<\S*?/\S*?title\S*?>", content.read())
			t = []
			for s in x:
				t.append(stemming.porter2.stem(s))
			t = " ".join(t)
			m.append(t)

		except:
			# Network/parse failure: skip this result.
			pass
	return m
开发者ID:SummerProject16,项目名称:SocialList,代码行数:30,代码来源:k_list_repeat.py


示例9: segment_hashtag

def segment_hashtag(h):
    """Split the hashtag *h* into words.

    Accepts either a regex match object or a plain string; the leading
    '#' character is dropped in both cases.  Returns the words wrapped
    in " hashtag ... , " markers.
    """
    tag = h.group()[1:] if hasattr(h, "group") else h[1:]
    words = wordsegment.segment(tag)
    return " hashtag " + " ".join(words) + " , "
开发者ID:lambros-mavrides,项目名称:ml_practice,代码行数:8,代码来源:morph.py


示例10: get_word_vector

 def get_word_vector(self, word):
     """Return (and memoize) the word2vec vector for *word*.

     Lookup order in the model vocabulary: exact form, lower-case,
     upper-case.  When all fail, the word is split on whitespace and,
     failing that, successively on ',', '/', ':', '-', '_' and finally
     by statistical word segmentation; sub-word vectors are combined
     with an FFT convolution.  Returns None for None input or for words
     that cannot be resolved.  Results are cached in
     ``self.word_vectors_map`` keyed by the lower-cased word.
     """
     if word is None:
         return None
     # Strip surrounding whitespace and bracket/parenthesis characters.
     word = word.strip().strip('[').strip(']').strip('(').strip(')')
     word_lower = word.lower()
     word_upper = word.upper()
     try:
         if word_lower not in self.word_vectors_map:
             if config.debug:
                 print 'getting word vector for ', word
             if word in self.word2vec_model.vocab:
                 self.word_vectors_map[word_lower] = self.word2vec_model[word]
             #todo: if vocab is ensured to be lower case, this condition is not required
             elif word_lower in self.word2vec_model.vocab:
                 self.word_vectors_map[word_lower] = self.word2vec_model[word_lower]
             elif word_upper in self.word2vec_model.vocab:
                 self.word_vectors_map[word_lower] = self.word2vec_model[word_upper]
             else:
                 # Word is entirely "concept" characters: retry with the
                 # non-alpha characters stripped out.
                 if not constants.concept_regexp.sub('', word):
                     return self.get_word_vector(constants.alpha_regex.sub('', word))
                 # Try successively more aggressive separators until one
                 # yields more than a single piece.
                 subwords = word.split()
                 if len(subwords) == 1:
                     subwords = word.split(',')
                     if len(subwords) == 1:
                         subwords = word.split('/')
                         if len(subwords) == 1:
                             subwords = word.split(':')
                             if len(subwords) == 1:
                                 subwords = word.split('-')
                                 if len(subwords) == 1:
                                     subwords = word.split('_')
                                     if len(subwords) == 1:
                                         # print 'performing word segmentation on ', word
                                         subwords = ws.segment(word.encode('utf8'))
                                         if len(subwords) == 1:
                                             print 'could not get wordvector for ', word
                                             self.word_vectors_map[word_lower] = None
                 if len(subwords) > 1:
                     # Recursively resolve each sub-word and fold the
                     # vectors together with an FFT convolution.
                     curr_wordvec = None
                     for curr_subword in subwords:
                         curr_subword_vec = self.get_word_vector(curr_subword)
                         if curr_subword_vec is not None:
                             if curr_wordvec is None:
                                 curr_wordvec = curr_subword_vec
                             else:
                                 start_time = time.time()
                                 curr_wordvec = ss.fftconvolve(curr_wordvec, curr_subword_vec, mode='same')
                                 if config.debug:
                                     print 'performed fast fourier transform convolution on word vectors in {} seconds.'.format(time.time()-start_time)
                     self.word_vectors_map[word_lower] = curr_wordvec
         return self.word_vectors_map[word_lower]
     except UnicodeDecodeError as ude:
         # Cache the failure so the bad word is not retried.
         print 'error getting word vector for ', word
         print ude.message
         self.word_vectors_map[word_lower] = None
         return self.word_vectors_map[word_lower]


示例11: read_nell_relations

def read_nell_relations():
	"""Read the relation names from the NELL graph.

	Walks ``nell/relations`` and converts each file name of the form
	"concept:relationname" into a space-separated, human-readable
	relation string.  Returns the list of relations.
	"""
	# Keep only the file list of the last directory visited, exactly as
	# the original loop did.
	for entry in os.walk("nell/relations"):
		file_names = entry[2]
	return [' '.join(segment(name.split(':')[1])) for name in file_names]
开发者ID:vedsarkushwaha,项目名称:KBH_NELL,代码行数:13,代码来源:read_data.py


示例12: test12

def test12(tagtocheck):
	"""Ratio of misspelled to correctly spelled words in the segmented tag.

	Segments *tagtocheck* into words and spell-checks each against the
	en-US dictionary.  Returns incorrect/correct formatted to four
	decimal places, or 0 when no word passes the spell check.
	"""
	checker = en.Dict("en-US")
	good = 0
	bad = 0
	for token in ws.segment(tagtocheck):
		if checker.check(token):
			good += 1
		else:
			bad += 1
	if good != 0:
		return "%.4f"%(float(bad)/good)
	return 0
开发者ID:SummerProject16,项目名称:SocialList,代码行数:14,代码来源:testFile12.py


示例13: create_dict_adva

def create_dict_adva():
	relation_name=[x[2] for x in os.walk("nell/relations")][0]
	sub_table={}
	obj_table={}
	for r in relation_name:
		lst=[]
		r_name=' '.join(segment(r.split(':')[1]))
		print r_name
		with open("nell/relations/"+r) as fp:
			for line in fp:
				line=line.rstrip('\n')
				sub,obj=line.split('\t')
				sub=sub.split(":")[1:]
				obj=obj.split(":")[1:]
				for tmp in sub:
					tmpsb=''.join(tmp.split('_'))
					tmpsb=segment(tmpsb)
					for sb in tmpsb:
						if sb in sub_table:
							tmp=sub_table[sb]
							tmp=tmp.union([r_name])
							sub_table[sb]=tmp
							#print("y")
						else:
							sub_table[sb]=set([r_name])
				for tmp in obj:
					tmpob=''.join(tmp.split('_'))
					tmpob=segment(tmpob)
					for ob in tmpob:
						if ob in obj_table:
							tmp=obj_table[ob]
							tmp=tmp.union([r_name])
							obj_table[ob]=tmp
							#print("yy")
						else:
							obj_table[ob]=set([r_name])
	return sub_table,obj_table
开发者ID:vedsarkushwaha,项目名称:KBH_NELL,代码行数:37,代码来源:relations_dict.py


示例14: read_relation_name

def read_relation_name(folder_name):
	"""Collect relation names from the sub-folders of *folder_name*.

	Each sub-folder is expected to be named "concept:relation"; the
	relation part is segmented into space-separated words.

	NOTE: both the walk root and the first sub-folder entry are skipped
	(``[1:]`` applied twice), mirroring the original behavior -- confirm
	whether the double skip is intentional.
	"""
	dir_paths = [entry[0] for entry in os.walk(folder_name)]
	names = [' '.join(segment(path.split(':')[1])) for path in dir_paths[1:]]
	return names[1:]
开发者ID:vedsarkushwaha,项目名称:KBH_NELL,代码行数:15,代码来源:read_data.py


示例15: checkTweetNums

def checkTweetNums(tweets,minTweets):
	"""Number-as-adjective heuristic.

	Segments each tweet, POS-tags the batch with the CMU tagger, and
	counts tweets whose joined tag string pairs a numeral ('$') with a
	noun-like tag (N, ^, M, Z).  Returns 1 when at least *minTweets*
	tweets match, else 0.
	"""
	segmented = [" ".join(wordsegment.segment(tweet)) for tweet in tweets]
	hits = 0
	for tags in cmu.runtagger_parse(segmented):
		joined = "".join(tags)
		# Numeral immediately followed by a noun-ish tag.
		if any(marker in joined for marker in ("$N", "$^", "$M", "$Z")):
			hits += 1
	return 1 if hits >= minTweets else 0
开发者ID:SummerProject16,项目名称:opinion-or-fact,代码行数:16,代码来源:TweetCheck.py


示例16: pos_tag_entropy

def pos_tag_entropy(tagtocheck,pos_list):
    """Shannon entropy (in bits) of the POS-tag distribution of a tag.

    Parameters
    ----------
    tagtocheck : str
        Kept for interface compatibility.  The original code segmented it
        into words (``segment(tagtocheck)``) but never used the result;
        that pure call was removed as dead code.
    pos_list : sequence
        POS tags, one per token of the segmented tag.

    Returns
    -------
    str
        Entropy formatted to four decimal places, e.g. "1.0000".

    Fixes vs. original: the ``ent = -ent`` negation turned an exact zero
    into -0.0, so uniform single-tag input printed "-0.0000"; entropy is
    now accumulated by subtraction so zero formats as "0.0000".  The
    redundant copy loop over ``xrange`` and the intermediate frequency
    list were also removed.
    """
    total = len(pos_list)
    counts = Counter(pos_list)  # tag -> multiplicity
    entropy = 0.0
    for count in counts.values():
        p = float(count) / total
        entropy -= p * math.log(p, 2)
    return "%.4f" % entropy
开发者ID:SummerProject16,项目名称:SocialList,代码行数:16,代码来源:testFile11.py


示例17: getchunks

def getchunks(password):
    """Break *password* into chunks.

    The password is first split into runs of symbols, letters, or digits;
    multi-letter runs are then further segmented into dictionary words.
    Returns the list of chunks (empty passwords produce a warning).
    """
    # Runs of non-alphanumerics/underscores, letters, or digits.
    raw_runs = re.findall('([\W_]+|[a-zA-Z]+|[0-9]+)', password)

    chunks = []
    for run in raw_runs:
        wordlike = run[0].isalpha() and len(run) > 1
        if wordlike:
            chunks.extend(ws.segment(run))
        else:
            chunks.append(run)

    if not chunks:
        log.warning("Unable to chunk password: {}".format(password))

    return chunks
开发者ID:vialab,项目名称:semantic-guesser,代码行数:17,代码来源:train.py


示例18: getWeight

def getWeight(hashtag,text_file):
	"""Return a list of rank-weighted frequencies for the strings in
	*text_file*.

	Googles the segmented hashtag, collects non-Google/YouTube result
	URLs, gets per-string occurrence counts for each URL via the project
	helper ``get_occurence_list``, and weights each count by the URL's
	position from the bottom of the result list.
	"""
	#proxy_handler
	proxy = ulib.ProxyHandler({'https': 'https://10.3.100.207:8080','http' : 'http://10.3.100.207:8080'})
	opener = ulib.build_opener(proxy)
	ulib.install_opener(opener)
	#split the hashtag into words
	spl_hash = ws.segment(hashtag)
	req = ulib.Request('https://www.google.co.in/search?q='+'+'.join(spl_hash), headers={'User-Agent' : "Mozilla/5.0"})
	dumpdata = ulib.urlopen(req).read()
	dumpdata = ulib.unquote(dumpdata)

	urls_ = re.findall("(http[s]*://[^:<&%]*?)[\"& ]",dumpdata)

	# NOTE(review): ``Set`` is the Python 2 ``sets.Set`` type, presumably
	# imported at module level -- confirm.
	urls = Set()

	# Drop search-engine / video links; keep unique external URLs.
	for _ in urls_:
		if not "google" in _ and not "youtube" in _:
			urls.add(_)

	occurance = []
	for _url in urls:
		try:
			temp = get_occurence_list(_url,text_file)
			occurance.append(temp)
			#frequencies of string for url _url
		except:
			# Fetch/parse failure: skip this URL.
			pass

	#now occurance is a list of lists containing frequencies for each url

	# NOTE(review): if every fetch failed, ``occurance`` is empty and
	# ``occurance[0]`` below raises IndexError -- unguarded edge case.
	final = [0 for _ in range(len(occurance[0]))]

	_length = len(occurance)
	#_length is total number of urls present

	for _x in range(len(occurance[0])):
		_x1 = 0
		for _o in occurance:
			final[_x] += _o[_x]*(_length-_x1)
			#multiplyinng frequency in each url with url position from bottom which gives weight
			_x1 += 1
	return final
开发者ID:SummerProject16,项目名称:SocialList,代码行数:43,代码来源:new_file.py


示例19: checkCategories

def checkCategories(hashtag):
	"""Return 1 when the segmented hashtag matches a known listicle-style
	category pattern ("10 ways to ...", "how to ...", etc.), else 0."""
	text = " ".join(ws.segment(hashtag))
	patterns = (
		".+?in\s\d+\swords",
		".+?in\s\d+\ssentences",
		".*?\d+\sreasons.+",
		".*?\d+\swords\sto.+",
		"^reasons\s.+",
		".*?ways\sto.+",
		".*?how\sto.+",
		".*?\d+\sways\sto.+",
		".*?\d+\sthings\sto.+",
		"^things.+",
		"^describe.*?in.*?",
		"^name\ssome.+?",
	)
	# Add new categories to the tuple above if found.
	for pattern in patterns:
		if re.match(pattern, text):
			return 1
	return 0
开发者ID:SummerProject16,项目名称:SocialList,代码行数:20,代码来源:Category.py


示例20: getWeight

def getWeight(hashtag="",string=""):
	"""Rank-weighted frequency of *string* across Google results for the
	segmented *hashtag*.

	Each result page contributes (occurrences of *string*) multiplied by
	the page's rank counted from the bottom of the result list; pages
	that fail to load are skipped but still consume a rank slot.
	"""
	# Route all traffic through the institute proxy.
	proxy = ulib.ProxyHandler({'https': 'https://10.3.100.207:8080','http' : 'http://10.3.100.207:8080'})
	ulib.install_opener(ulib.build_opener(proxy))
	query_words = ws.segment(hashtag)
	req = ulib.Request('https://www.google.com/search?q='+'+'.join(query_words), headers={'User-Agent' : "Mozilla/5.0"})
	result_page = ulib.urlopen(req).read()
	urls = re.findall("(http.*?)[\" ]",result_page)
	weight = 0
	rank = len(urls)  # top result gets the largest multiplier
	for _url in urls:
		page_req = ulib.Request(_url,headers={'User-Agent' : "Mozilla/5.0"})
		try:
			page_text = ulib.urlopen(page_req).read().lower()
			weight += len(re.findall(string.lower(),page_text)) * rank
		except:
			pass
		rank -= 1
	return weight
开发者ID:SummerProject16,项目名称:SocialList,代码行数:22,代码来源:frequency.py



注:本文中的wordsegment.segment函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python wordy.calculate函数代码示例发布时间:2022-05-26
下一篇:
Python wordpress_xmlrpc.WordPressPost类代码示例发布时间:2022-05-26
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap