• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python simfunctions.tfidf函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中py_stringmatching.simfunctions.tfidf函数的典型用法代码示例。如果您正苦于以下问题：Python tfidf函数的具体用法？Python tfidf怎么用？Python tfidf使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了tfidf函数的17个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: test_valid_input

 def test_valid_input(self):
     """Compare tfidf's result with a fixed expected score for several inputs.

     The rows cover calls with an explicit corpus, calls without a corpus,
     a corpus sharing no token with the bags, two identical bags, and an
     empty first bag.
     """
     # Each row: (positional arguments passed to tfidf, expected score).
     # NOTE(review): the trailing True in the first row is presumably the
     # `dampen` flag -- confirm against tfidf's signature.
     scenarios = (
         ((['a', 'b', 'a'], ['a', 'c'],
           [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], True),
          0.11166746710505392),
         ((['a', 'b', 'a'], ['a', 'c'],
           [['a', 'b', 'a'], ['a', 'c'], ['a']]),
          0.17541160386140586),
         ((['a', 'b', 'a'], ['a'],
           [['a', 'b', 'a'], ['a', 'c'], ['a']]),
          0.5547001962252291),
         ((['a', 'b', 'a'], ['a']), 0.7071067811865475),
         ((['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']]), 0.0),
         ((['a', 'b', 'a'], ['a']), 0.7071067811865475),
         ((['a', 'b', 'a'], ['a', 'b', 'a']), 1.0),
         (([], ['a', 'b', 'a']), 0.0),
     )
     for call_args, expected_score in scenarios:
         self.assertEqual(tfidf(*call_args), expected_score)
开发者ID:alihitawala,项目名称:py_stringmatching,代码行数:10,代码来源:test_simfunctions.py


示例2: test_invalid_input4

 def test_invalid_input4(self):
     """Call tfidf with a token list as first bag and None as second bag.

     NOTE(review): no assertion is visible here; the expected failure is
     presumably checked by a decorator outside this excerpt -- confirm.
     """
     first_bag = ['a']
     second_bag = None
     tfidf(first_bag, second_bag)
开发者ID:kvpradap,项目名称:py_stringmatching,代码行数:2,代码来源:test_simfunctions.py


示例3: test_invalid_input1

 def test_invalid_input1(self):
     """Call tfidf with the integer 1 for both bags instead of token lists.

     NOTE(review): no assertion is visible here; the expected failure is
     presumably checked by a decorator outside this excerpt -- confirm.
     """
     not_a_bag = 1
     tfidf(not_a_bag, not_a_bag)
开发者ID:kvpradap,项目名称:py_stringmatching,代码行数:2,代码来源:test_simfunctions.py


示例4: Score

	attribute_id1 = product_dict[id1]
	attribute_id2 = product_dict[id2]
	id.append([id1,id2])

	# class label
	if (match_dict[pair] == 'MATCH'):
		classlabels.append(1)
	else:
		classlabels.append(0)


	####feature: Product Name ---- Jaccard Score (word boundary, 3-gram), edit distance, tf/idf
	if ("Product Name" in attribute_id1 and "Product Name" in attribute_id2):
		jaccard_productName = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]))
		jaccard3gram_productName = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Name"][0], 3), tokenizers.qgram(attribute_id2["Product Name"][0], 3))
		tfidf_productName = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]), productName_courpus)
		edit_productName = simfunctions.levenshtein(attribute_id1["Product Name"][0], attribute_id2["Product Name"][0])
		edit_productName = 1 - edit_productName/max(len(attribute_id1["Product Name"][0]), len(attribute_id2["Product Name"][0]))
	else:
		jaccard_productName = 0
		jaccard3gram_productName = 0
		tfidf_productName = 0
		edit_productName = 0

	####feature: Manufacturer
	if ("Manufacturer" in attribute_id1 and "Manufacturer" in attribute_id2):
		jaccard_manufacturer = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
		jaccard3gram_manufacturer = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Manufacturer"][0], 3), tokenizers.qgram(attribute_id2["Manufacturer"][0], 3))
		tfidf_manufacturer = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
	else:
		jaccard_manufacturer = 0
开发者ID:njuwangchen,项目名称:CS-784,代码行数:31,代码来源:readdata_s3.py


示例5: generate_feature

def generate_feature(filename):
    productName_courpus = []
    brand_courpus = []
    with open(filename, 'r') as f:
        for line in f:
            list_line = line.split('?')
            attribute_id1 = json.loads(list_line[2], encoding = 'latin-1')
            attribute_id2 = json.loads(list_line[4], encoding = 'latin-1')

            if "Product Name" in attribute_id1:
		        productName_courpus.append(tokenizers.delimiter(attribute_id1["Product Name"][0]))
            if "Product Name" in attribute_id2:
                productName_courpus.append(tokenizers.delimiter(attribute_id2["Product Name"][0]))

            if "Brand" in attribute_id1:
                brand_courpus.append(tokenizers.delimiter(attribute_id1["Brand"][0]))
            if "Brand" in attribute_id2:
                brand_courpus.append(tokenizers.delimiter(attribute_id2["Brand"][0]))

    feature_matrix = []
    with open(filename, 'r') as f:
        i = 1
        for line in f:
            list_line = line.split('?')
            attribute_id1 = json.loads(list_line[2], encoding = 'latin-1')
            attribute_id2 = json.loads(list_line[4], encoding = 'latin-1')

            print 'Generate features for pair', i
            i = i+1

            instance = []

            #Product Name 4
            if ("Product Name" in attribute_id1 and "Product Name" in attribute_id2):
                jaccard_productName = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]))
                jaccard3gram_productName = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Name"][0], 3), tokenizers.qgram(attribute_id2["Product Name"][0], 3))
                tfidf_productName = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]), productName_courpus)
                edit_productName = simfunctions.levenshtein(attribute_id1["Product Name"][0], attribute_id2["Product Name"][0])
                edit_productName = 1 - edit_productName/max(len(attribute_id1["Product Name"][0]), len(attribute_id2["Product Name"][0]))
            else:
                jaccard_productName = 0
                jaccard3gram_productName = 0
                tfidf_productName = 0
                edit_productName = 0

            instance += [jaccard_productName, jaccard3gram_productName, tfidf_productName, edit_productName]

            #Manufacturer 3
            if ("Manufacturer" in attribute_id1 and "Manufacturer" in attribute_id2):
                jaccard_manufacturer = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
                jaccard3gram_manufacturer = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Manufacturer"][0], 3), tokenizers.qgram(attribute_id2["Manufacturer"][0], 3))
                tfidf_manufacturer = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
            else:
                jaccard_manufacturer = 0
                jaccard3gram_manufacturer = 0
                tfidf_manufacturer = 0

            instance += [jaccard_manufacturer, jaccard3gram_manufacturer, tfidf_manufacturer]

            #Color 3
            if ("Color" in attribute_id1 and "Color" in attribute_id2):
                jaccard_color = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Color"][0]), tokenizers.delimiter(attribute_id2["Color"][0]))
                jaccard3gram_color = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Color"][0], 3), tokenizers.qgram(attribute_id2["Color"][0], 3))
                tfidf_color = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Color"][0]), tokenizers.delimiter(attribute_id2["Color"][0]))
            else:
                jaccard_color = 0
                jaccard3gram_color = 0
                tfidf_color = 0

            instance += [jaccard_color, jaccard3gram_color, tfidf_color]

            #Product Type 3
            if ("Product Type" in attribute_id1 and "Product Type" in attribute_id2):
                jaccard_productType = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Type"][0]),tokenizers.delimiter(attribute_id2["Product Type"][0]))
                jaccard3gram_productType = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Type"][0], 3),tokenizers.qgram(attribute_id2["Product Type"][0], 3))
                tfidf_productType = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Type"][0]),tokenizers.delimiter(attribute_id2["Product Type"][0]))
            else:
                jaccard_productType = 0
                jaccard3gram_productType = 0
                tfidf_productType = 0

            instance += [jaccard_productType, jaccard3gram_productType, tfidf_productType]

            #Product Segment 3
            if "Product Segment" in attribute_id1 and "Product Segment" in attribute_id2:
                jaccard_productSegment = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Segment"][0]),tokenizers.delimiter(attribute_id2["Product Segment"][0]))
                jaccard3gram_productSegment= simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Segment"][0], 3),tokenizers.qgram(attribute_id2["Product Segment"][0], 3))
                if (attribute_id1["Product Segment"][0] == attribute_id2["Product Segment"][0]):
                    exactMatch_productSegment = 1
                else:
                    exactMatch_productSegment = 0
            else:
                exactMatch_productSegment = 0
                jaccard_productSegment = 0
                jaccard3gram_productSegment = 0

            instance += [exactMatch_productSegment, jaccard_productSegment, jaccard3gram_productSegment]

            #Brand 4
            if ("Brand" in attribute_id1 and "Brand" in attribute_id2):
#.........这里部分代码省略.........
开发者ID:mtian29,项目名称:CS784-final,代码行数:101,代码来源:feature_generator.py


示例6: time_medium_large_wi_rep

 def time_medium_large_wi_rep(self):
     """Time tfidf on the medium/large `_wi_rep` token lists.

     Runs with `self.corpus_list` as the corpus and `dampen=True`.
     """
     # Use the `_wi_rep` fixtures so the measured input matches this case's
     # name; passing the `_wo_rep` lists would only repeat the `wo_rep`
     # measurement.
     simfunctions.tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list, dampen=True)
开发者ID:sanjibkd,项目名称:py_stringmatching,代码行数:2,代码来源:benchamarks.py


示例7: time_medium_large_wo_rep_no_dampen

 def time_medium_large_wo_rep_no_dampen(self):
     """Time tfidf on `_med_num_tokens_wo_rep` vs `_large_num_tokens_wo_rep`.

     `self.corpus_list` is supplied as the corpus; `dampen` is not passed,
     so tfidf uses its own default for that flag.
     """
     corpus = self.corpus_list
     simfunctions.tfidf(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=corpus)
开发者ID:sanjibkd,项目名称:py_stringmatching,代码行数:2,代码来源:benchamarks.py


示例8: time_small_large_wo_rep_no_corpus_no_dampen

 def time_small_large_wo_rep_no_corpus_no_dampen(self):
     """Time tfidf on `_small_num_tokens_wo_rep` vs `_large_num_tokens_wo_rep`.

     Neither a corpus nor `dampen` is passed, so tfidf uses its defaults.
     """
     bag_pair = (_small_num_tokens_wo_rep, _large_num_tokens_wo_rep)
     simfunctions.tfidf(*bag_pair)
开发者ID:sanjibkd,项目名称:py_stringmatching,代码行数:2,代码来源:benchamarks.py


示例9: time_small_medium_wo_rep_no_dampen

 def time_small_medium_wo_rep_no_dampen(self):
     """Time tfidf on `_small_num_tokens_wo_rep` vs `_med_num_tokens_wo_rep`.

     `self.corpus_list` is supplied as the corpus; `dampen` is not passed,
     so tfidf uses its own default for that flag.
     """
     corpus = self.corpus_list
     simfunctions.tfidf(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep, corpus_list=corpus)
开发者ID:sanjibkd,项目名称:py_stringmatching,代码行数:2,代码来源:benchamarks.py


示例10: time_small_large_wi_rep_no_dampen

 def time_small_large_wi_rep_no_dampen(self):
     """Time tfidf on `_small_num_tokens_wi_rep` vs `_large_num_tokens_wi_rep`.

     `self.corpus_list` is supplied as the corpus; `dampen` is not passed,
     so tfidf uses its own default for that flag.
     """
     corpus = self.corpus_list
     simfunctions.tfidf(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=corpus)
开发者ID:sanjibkd,项目名称:py_stringmatching,代码行数:2,代码来源:benchamarks.py


示例11: time_medium_large_wi_rep_no_corpus

 def time_medium_large_wi_rep_no_corpus(self):
     """Time tfidf on the medium/large `_wi_rep` token lists, no corpus.

     No corpus is passed, so tfidf uses its default; `dampen=True`.
     """
     # Use the `_wi_rep` fixtures so the measured input matches this case's
     # name; passing the `_wo_rep` lists would only repeat the `wo_rep`
     # measurement.
     simfunctions.tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep, dampen=True)
开发者ID:sanjibkd,项目名称:py_stringmatching,代码行数:2,代码来源:benchamarks.py


示例12: time_small_large_wi_rep_no_corpus

 def time_small_large_wi_rep_no_corpus(self):
     """Time tfidf on `_small_num_tokens_wi_rep` vs `_large_num_tokens_wi_rep`.

     No corpus is passed; `dampen` is set to True.
     """
     options = {'dampen': True}
     simfunctions.tfidf(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep, **options)
开发者ID:sanjibkd,项目名称:py_stringmatching,代码行数:2,代码来源:benchamarks.py


示例13: time_small_medium_wo_rep_no_corpus

 def time_small_medium_wo_rep_no_corpus(self):
     """Time tfidf on `_small_num_tokens_wo_rep` vs `_med_num_tokens_wo_rep`.

     No corpus is passed; `dampen` is set to True.
     """
     options = {'dampen': True}
     simfunctions.tfidf(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep, **options)
开发者ID:sanjibkd,项目名称:py_stringmatching,代码行数:2,代码来源:benchamarks.py


示例14: test_invalid_input2

 def test_invalid_input2(self):
     """Call tfidf with None as first bag and a token list as second bag.

     NOTE(review): no assertion is visible here; the expected failure is
     presumably checked by a decorator outside this excerpt -- confirm.
     """
     first_bag = None
     second_bag = ['b']
     tfidf(first_bag, second_bag)
开发者ID:kvpradap,项目名称:py_stringmatching,代码行数:2,代码来源:test_simfunctions.py


示例15: time_small_medium_wi_rep

 def time_small_medium_wi_rep(self):
     """Time tfidf on `_small_num_tokens_wi_rep` vs `_med_num_tokens_wi_rep`.

     `self.corpus_list` is supplied as the corpus and `dampen` is True.
     """
     options = {'corpus_list': self.corpus_list, 'dampen': True}
     simfunctions.tfidf(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep, **options)
开发者ID:sanjibkd,项目名称:py_stringmatching,代码行数:2,代码来源:benchamarks.py


示例16: test_invalid_input3

 def test_invalid_input3(self):
     """Call tfidf with None for both bags.

     NOTE(review): no assertion is visible here; the expected failure is
     presumably checked by a decorator outside this excerpt -- confirm.
     """
     missing_bag = None
     tfidf(missing_bag, missing_bag)
开发者ID:kvpradap,项目名称:py_stringmatching,代码行数:2,代码来源:test_simfunctions.py


示例17: time_small_medium_wi_rep_no_corpus_no_dampen

 def time_small_medium_wi_rep_no_corpus_no_dampen(self):
     """Time tfidf on `_small_num_tokens_wi_rep` vs `_med_num_tokens_wi_rep`.

     Neither a corpus nor `dampen` is passed, so tfidf uses its defaults.
     """
     bag_pair = (_small_num_tokens_wi_rep, _med_num_tokens_wi_rep)
     simfunctions.tfidf(*bag_pair)
开发者ID:sanjibkd,项目名称:py_stringmatching,代码行数:2,代码来源:benchamarks.py



注:本文中的py_stringmatching.simfunctions.tfidf函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python controller.init_host函数代码示例发布时间:2022-05-25
下一篇:
Python py_pjsua.perror函数代码示例发布时间:2022-05-25
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap