本文整理汇总了Python中pyprepbuddy.rdds.transformable_rdd.TransformableRDD类的典型用法代码示例。如果您正苦于以下问题：Python TransformableRDD类的具体用法？Python TransformableRDD怎么用？Python TransformableRDD使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了TransformableRDD类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_should_smooth_data_by_Simple_Moving_Average
def test_should_smooth_data_by_Simple_Moving_Average(self):
    """Smooth column 1 with a window-3 SimpleMovingAverage and check the first value.

    The input is seven CSV rows spread over 3 partitions.
    """
    initial_dataset = self.sc.parallelize(
        ["52,3,53", "23,4,64", "23,5,64", "23,6,64", "23,7,64", "23,8,64", "23,9,64"], 3)
    transformable_rdd = TransformableRDD(initial_dataset, "csv")
    transformed = transformable_rdd.smooth(1, SimpleMovingAverage(3))
    # Column 1 starts 3, 4, 5, whose mean is 4.0 (presumably the first window).
    expected = 4.0
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(expected, transformed.first())
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:7,代码来源:smoothers_test.py
示例2: test_replace_values_should_replace_cluster_values_with_given_text
def test_replace_values_should_replace_cluster_values_with_given_text(self):
    """Replace the first column-0 cluster with "Hello" and expect the row "Hello,B"."""
    initial_dataset = self.sc.parallelize(["XA,Y", "A,B", "AX,Z", "A,Q", "A,E"])
    transformable_rdd = TransformableRDD(initial_dataset)
    clusters = transformable_rdd.clusters(0, NGramFingerprintAlgorithm(1))
    one_cluster = clusters.get_all_clusters()[0]
    values = transformable_rdd.replace_values(one_cluster, "Hello", 0).collect()
    # assertIn reports the collected rows on failure, unlike assertTrue(__contains__).
    self.assertIn("Hello,B", values)
开发者ID:data-commons,项目名称:prep-buddy,代码行数:7,代码来源:cluster_test.py
示例3: test_pivot_table_by_count_should_give_pivoted_table
def test_pivot_table_by_count_should_give_pivoted_table(self):
    """Call pivot_by_count(4, [0, 1, 2, 3]) and check two ("skips", ...) cells.

    In the sample rows, "skips" appears with "known" six times and with
    "unknown" three times.
    """
    rows = [
        "known,new,long,home,skips",
        "unknown,new,short,work,reads",
        "unknown,follow Up,long,work,skips",
        "known,follow Up,long,home,skips",
        "known,new,short,home,reads",
        "known,follow Up,long,work,skips",
        "unknown,follow Up,short,work,skips",
        "unknown,new,short,work,reads",
        "known,follow Up,long,home,skips",
        "known,new,long,work,skips",
        "unknown,follow Up,short,home,skips",
        "known,new,long,work,skips",
        "known,follow Up,short,home,reads",
        "known,new,short,work,reads",
        "known,new,short,home,reads",
        "known,follow Up,short,work,reads",
        "known,new,short,home,reads",
        "unknown,new,short,work,reads",
    ]
    source_rdd = TransformableRDD(self.sc.parallelize(rows), "csv")
    pivot = source_rdd.pivot_by_count(4, [0, 1, 2, 3])
    self.assertEqual(6, pivot.value_at("skips", "known"))
    self.assertEqual(3, pivot.value_at("skips", "unknown"))
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:26,代码来源:transformable_rdd_test.py
示例4: test_should_split_given_column_indexes_split_by_delimiter_with_retain_column
def test_should_split_given_column_indexes_split_by_delimiter_with_retain_column(self):
    """Split column 0 on spaces with the third argument set to True.

    The expected row keeps the original column 0 and has the three
    space-separated pieces appended after the existing columns.
    """
    initial_data_set = self.sc.parallelize(["FirstName LastName MiddleName,850"])
    initial_rdd = TransformableRDD(initial_data_set, "csv")
    split_with_retained_columns = initial_rdd.split_by_delimiter(0, " ", True)
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(
        "FirstName LastName MiddleName,850,FirstName,LastName,MiddleName",
        split_with_retained_columns.first())
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:7,代码来源:split_columns_test.py
示例5: test_clusters_should_give_clusters_of_given_column_index
def test_clusters_should_give_clusters_of_given_column_index(self):
    """Cluster column 0 with SimpleFingerprint and inspect the first cluster.

    The first cluster must contain "CLUSTER Of Finger print" and must not
    contain "finger print for cluster".
    """
    rdd = self.sc.parallelize(["CLUSTER Of Finger print", "finger print of cluster", "finger print for cluster"])
    transformable_rdd = TransformableRDD(rdd, 'csv')
    clusters = transformable_rdd.clusters(0, SimpleFingerprint())
    list_of_clusters = clusters.get_all_clusters()
    one_cluster = list_of_clusters[0]
    # The `in` operator dispatches to the cluster's __contains__; no explicit dunder call needed.
    self.assertIn("CLUSTER Of Finger print", one_cluster)
    self.assertNotIn("finger print for cluster", one_cluster)
开发者ID:data-commons,项目名称:prep-buddy,代码行数:8,代码来源:cluster_test.py
示例6: test_clusters_should_give_clusters_By_n_gram_fingerprint
def test_clusters_should_give_clusters_By_n_gram_fingerprint(self):
    """Cluster column 0 with NGramFingerprintAlgorithm(1) and inspect the first cluster.

    The first cluster must contain both "CLUSTER Of Finger print" and
    "finger print for cluster".
    """
    rdd = self.sc.parallelize(["CLUSTER Of Finger print", "finger print of cluster", "finger print for cluster"])
    transformable_rdd = TransformableRDD(rdd, 'csv')
    clusters = transformable_rdd.clusters(0, NGramFingerprintAlgorithm(1))
    list_of_clusters = clusters.get_all_clusters()
    one_cluster = list_of_clusters[0]
    # The `in` operator dispatches to the cluster's __contains__; no explicit dunder call needed.
    self.assertIn("CLUSTER Of Finger print", one_cluster)
    self.assertIn("finger print for cluster", one_cluster)
开发者ID:data-commons,项目名称:prep-buddy,代码行数:8,代码来源:cluster_test.py
示例7: test_multiply_column_should_multiply_two_given_column
def test_multiply_column_should_multiply_two_given_column(self):
    """multiply_columns(0, 1) should yield the row-wise products 1.0, 2.0 and 3.0."""
    initial_dataset = self.sc.parallelize(["1,1", "1,2", "1,3"])
    transformable_rdd = TransformableRDD(initial_dataset)
    multiplied_rdd = transformable_rdd.multiply_columns(0, 1)
    collected = multiplied_rdd.collect()
    # Products of the sample rows: 1*1, 1*2, 1*3.
    for product in (1.0, 2.0, 3.0):
        self.assertIn(product, collected)
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:8,代码来源:transformable_rdd_test.py
示例8: test_to_double_rdd_should_change_string_to_double_rdd
def test_to_double_rdd_should_change_string_to_double_rdd(self):
    """to_double_rdd(0) should yield column 0 of the sample rows as 1.0, 5.0 and 8.0."""
    initial_dataset = self.sc.parallelize(["1,1", "5,2", "8,3"])
    transformable_rdd = TransformableRDD(initial_dataset)
    rdd = transformable_rdd.to_double_rdd(0)
    collected = rdd.collect()
    # Column 0 of the sample rows holds "1", "5" and "8".
    for value in (1.0, 5.0, 8.0):
        self.assertIn(value, collected)
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:8,代码来源:transformable_rdd_test.py
示例9: test_map_should_give_Transformable_rdd
def test_map_should_give_Transformable_rdd(self):
    """The result of map() must still support deduplicate().

    After "yes" is appended to each line, the two identical "1,2" rows
    collapse, leaving two distinct records.
    """
    initial_dataset = self.sc.parallelize(["1,2", "1,2", "1,3"])
    transformable_rdd = TransformableRDD(initial_dataset, "csv")
    rdd_map = transformable_rdd.map(lambda line: line + "yes")
    deduplicate = rdd_map.deduplicate()
    collected = deduplicate.collect()
    # Use the len() builtin and assertIn rather than calling dunder methods directly.
    self.assertEqual(2, len(collected))
    expected = "1,2yes"
    self.assertIn(expected, collected)
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:9,代码来源:transformable_rdd_test.py
示例10: test_should_split_given_column_by_field_length_with_retained_columns
def test_should_split_given_column_by_field_length_with_retained_columns(self):
    """Split column 3 by field lengths [3, 10] with the third argument set to True.

    The expected rows keep the original phone column and append its
    3-character and 10-character pieces. They also show the padding spaces
    of the input fields removed.
    """
    data = ["John,Male,21,+914382313832,Canada", "Smith, Male, 30,+015314343462, UK",
            "Larry, Male, 23,+009815432975, USA", "Fiona, Female,18,+891015709854,USA"]
    initial_data_set = self.sc.parallelize(data)
    initial_rdd = TransformableRDD(initial_data_set, "csv")
    result = initial_rdd.split_by_field_length(3, [3, 10], True).collect()
    # assertEqual shows the actual length on failure, unlike assertTrue(len(...) == 4).
    self.assertEqual(4, len(result))
    self.assertIn("John,Male,21,+914382313832,Canada,+91,4382313832", result)
    self.assertIn("Smith,Male,30,+015314343462,UK,+01,5314343462", result)
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:11,代码来源:split_columns_test.py
示例11: test_transformableRDD_can_impute_the_missing_values_by_NaiveBayesSubstitution
def test_transformableRDD_can_impute_the_missing_values_by_NaiveBayesSubstitution(self):
    """Impute the empty column 4 of the last row using NaiveBayesSubstitution(0, 1, 2, 3).

    The last sample row has no value in column 4; after imputation it is
    expected to read "Drew,Yes,Blue,Long,Female".
    """
    rdd = self.sc.parallelize(["Drew,No,Blue,Short,Male",
                               "Claudia,Yes,Brown,Long,Female",
                               "Drew,No,Blue,Long,Female",
                               "Drew,No,Blue,Long,Female",
                               "Alberto,Yes,Brown,Short,Male",
                               "Karin,No,Blue,Long,Female",
                               "Nina,Yes,Brown,Short,Female",
                               "Sergio,Yes,Blue,Long,Male",
                               "Drew,Yes,Blue,Long,"])
    transformable_rdd = TransformableRDD(rdd, 'csv')
    imputed_rdd = transformable_rdd.impute(4, NaiveBayesSubstitution(0, 1, 2, 3))
    # assertIn reports the collected rows on failure, unlike assertTrue(__contains__).
    self.assertIn("Drew,Yes,Blue,Long,Female", imputed_rdd.collect())
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:13,代码来源:transformable_rdd_test.py
示例12: test_should_split_the_given_column_by_delimiter_into_given_number_of_split
def test_should_split_the_given_column_by_delimiter_into_given_number_of_split(self):
    """Split TSV column 3 on "-" via split_by_delimiter(3, "-", False, 2).

    In the expected rows the original phone column is gone and two pieces
    are appended: the part before the first "-" and the remainder, with its
    inner "-" intact.
    """
    data = [
        "John\tMale\t21\t+91-4382-313832\tCanada",
        "Smith\tMale\t30\t+01-5314-343462\tUK",
        "Larry\tMale\t23\t+00-9815-432975\tUSA",
        "Fiona\tFemale\t18\t+89-1015-709854\tUSA"
    ]
    initial_data_set = self.sc.parallelize(data)
    initial_rdd = TransformableRDD(initial_data_set, "tsv")
    new_dataset = initial_rdd.split_by_delimiter(3, "-", False, 2)
    list_of_records = new_dataset.collect()
    # Use the len() builtin and assertIn rather than calling dunder methods directly.
    self.assertEqual(4, len(list_of_records))
    self.assertIn("John\tMale\t21\tCanada\t+91\t4382-313832", list_of_records)
    self.assertIn("Smith\tMale\t30\tUK\t+01\t5314-343462", list_of_records)
开发者ID:data-commons,项目名称:prep-buddy,代码行数:16,代码来源:split_columns_test.py
示例13: test_should_smooth_data_by_Weighted_Moving_Average
def test_should_smooth_data_by_Weighted_Moving_Average(self):
    """Smooth column 0 with a window-3 WeightedMovingAverage and check the first value.

    The input is twelve single-column rows spread over 3 partitions.
    """
    initial_dataset = self.sc.parallelize(["10", "12", "16", "13", "17", "19", "15", "20", "22", "19", "21", "19"],
                                          3)
    transformable_rdd = TransformableRDD(initial_dataset, "csv")
    weights = Weights(3)
    weights.add(0.166)
    weights.add(0.333)
    weights.add(0.5)
    moving_average = WeightedMovingAverage(3, weights)
    rdd = transformable_rdd.smooth(0, moving_average)
    # 10 * 0.166 + 12 * 0.333 + 16 * 0.5 = 13.656 for the first three rows
    # (presumably the first window).
    expected = 13.656
    actual = rdd.first()
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(expected, actual)
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:16,代码来源:smoothers_test.py
示例14: test_should_normalize_by_Z_Score_normalization
def test_should_normalize_by_Z_Score_normalization(self):
    """Normalize column 3 with ZScoreNormalizer and check the selected column's values."""
    initial_dataset = self.sc.parallelize([
        "07434677419,07371326239,Incoming,211,Wed Sep 15 19:17:44 +0100 2010",
        "07641036117,01666472054,Outgoing,0,Mon Feb 11 07:18:23 +0000 1980",
        "07641036117,07371326239,Incoming,45,Mon Feb 11 07:45:42 +0000 1980",
        "07641036117,07371326239,Incoming,45,Mon Feb 11 07:45:42 +0000 1980",
        "07641036117,07681546436,Missed,12,Mon Feb 11 08:04:42 +0000 1980"])
    transformable_rdd = TransformableRDD(initial_dataset, 'csv')
    final_rdd = transformable_rdd.normalize(3, ZScoreNormalizer())
    normalized_durations = final_rdd.select(3).collect()
    # One expected value per input row, in row order. Rows 3 and 4 are
    # identical, so their value appears twice.
    expected_values = [
        "1.944528306701421",
        "-0.8202659838241843",
        "-0.2306179123850742",
        "-0.2306179123850742",
        "-0.6630264981070882",
    ]
    for expected in expected_values:
        # assertIn names the missing value on failure, unlike assertTrue(__contains__).
        self.assertIn(expected, normalized_durations)
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:21,代码来源:noramalization_test.py
示例15: test_should_normalize_by_Decimal_Scale
def test_should_normalize_by_Decimal_Scale(self):
    """Normalize column 3 with DecimalScalingNormalizer and check the selected column's values."""
    initial_dataset = self.sc.parallelize([
        "07434677419,07371326239,Incoming,211,Wed Sep 15 19:17:44 +0100 2010",
        "07641036117,01666472054,Outgoing,0,Mon Feb 11 07:18:23 +0000 1980",
        "07641036117,07371326239,Incoming,45,Mon Feb 11 07:45:42 +0000 1980",
        "07641036117,07371326239,Incoming,45,Mon Feb 11 07:45:42 +0000 1980",
        "07641036117,07681546436,Missed,12,Mon Feb 11 08:04:42 +0000 1980"])
    transformable_rdd = TransformableRDD(initial_dataset, 'csv')
    final_rdd = transformable_rdd.normalize(3, DecimalScalingNormalizer())
    normalized_durations = final_rdd.select(3).collect()
    # One expected value per input row, in row order.
    # NOTE(review): these equal each duration divided by 100 (211 -> "2.11");
    # confirm this is the scaling DecimalScalingNormalizer intends.
    expected_values = ["2.11", "0.0", "0.45", "0.45", "0.12"]
    for expected in expected_values:
        # assertIn names the missing value on failure, unlike assertTrue(__contains__).
        self.assertIn(expected, normalized_durations)
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:21,代码来源:noramalization_test.py
示例16: test_should_normalize_by_Min_Max_normalization
def test_should_normalize_by_Min_Max_normalization(self):
    """Normalize column 3 with MinMaxNormalizer(0, 1) and check the selected column's values."""
    initial_dataset = self.sc.parallelize([
        "07434677419,07371326239,Incoming,211,Wed Sep 15 19:17:44 +0100 2010",
        "07641036117,01666472054,Outgoing,0,Mon Feb 11 07:18:23 +0000 1980",
        "07641036117,07371326239,Incoming,45,Mon Feb 11 07:45:42 +0000 1980",
        "07641036117,07371326239,Incoming,45,Mon Feb 11 07:45:42 +0000 1980",
        "07641036117,07681546436,Missed,12,Mon Feb 11 08:04:42 +0000 1980"])
    transformable_rdd = TransformableRDD(initial_dataset, 'csv')
    final_rdd = transformable_rdd.normalize(3, MinMaxNormalizer(0, 1))
    normalized_durations = final_rdd.select(3).collect()
    # One expected value per input row, in row order. With a column minimum
    # of 0 and maximum of 211, each value equals duration / 211.
    expected_values = [
        "1.0",
        "0.0",
        "0.2132701421800948",
        "0.2132701421800948",
        "0.05687203791469194",
    ]
    for expected in expected_values:
        # assertIn names the missing value on failure, unlike assertTrue(__contains__).
        self.assertIn(expected, normalized_durations)
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:21,代码来源:noramalization_test.py
示例17: test_add_columns_from_should_merge_all_columns_of_other_transformable_rdd
def test_add_columns_from_should_merge_all_columns_of_other_transformable_rdd(self):
    """add_columns_from() should append the TSV RDD's columns to the CSV RDD's rows.

    The expected rows are comma-separated, i.e. in the format of the RDD
    that add_columns_from() was called on.
    """
    initial_spelled_numbers = self.sc.parallelize([
        "One,Two,Three",
        "Four,Five,Six",
        "Seven,Eight,Nine",
        "Ten,Eleven,Twelve"
    ])
    spelled_numbers = TransformableRDD(initial_spelled_numbers, "csv")
    initial_numeric_data = self.sc.parallelize([
        "1\t2\t3",
        "4\t5\t6",
        "7\t8\t9",
        "10\t11\t12"
    ])
    numeric_data = TransformableRDD(initial_numeric_data, "tsv")
    result = spelled_numbers.add_columns_from(numeric_data).collect()
    expected_rows = (
        "One,Two,Three,1,2,3",
        "Four,Five,Six,4,5,6",
        "Seven,Eight,Nine,7,8,9",
        "Ten,Eleven,Twelve,10,11,12",
    )
    for row in expected_rows:
        # assertIn names the missing row on failure, unlike assertTrue(__contains__).
        self.assertIn(row, result)
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:22,代码来源:transformable_rdd_test.py
示例18: test_filter_should_give_Transformable_rdd
def test_filter_should_give_Transformable_rdd(self):
    """Filtering out rows whose second field is "2" should leave exactly one row."""
    initial_dataset = self.sc.parallelize(["1,2", "1,2", "1,3"])
    transformable_rdd = TransformableRDD(initial_dataset, "csv")
    rdd_filter = transformable_rdd.filter(lambda line: line.split(",")[1] != "2")
    collected = rdd_filter.collect()
    # Only "1,3" passes the predicate; use len() rather than calling __len__ directly.
    self.assertEqual(1, len(collected))
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:6,代码来源:transformable_rdd_test.py
示例19: test_should_give_highest_of_facets
def test_should_give_highest_of_facets(self):
    """The first entry of highest() for column 0's facets should be keyed "A".

    In the sample rows "A" occurs three times in column 0 and "X" twice.
    """
    rows = ["X,Y", "A,B", "X,Z", "A,Q", "A,E"]
    facets = TransformableRDD(self.sc.parallelize(rows)).list_facets_of(0)
    top_entries = facets.highest()
    # The first component of each entry is read through its _1() accessor.
    self.assertEqual("A", top_entries[0]._1())
开发者ID:data-commons,项目名称:prep-buddy,代码行数:6,代码来源:cluster_test.py
示例20: test_get_duplicates_should_give_duplicates_of_given_column_indexes
def test_get_duplicates_should_give_duplicates_of_given_column_indexes(self):
    """get_duplicates([0]) should return "Ram,23" first; it is the only repeated row."""
    records = ["Ram,23", "Ram,23", "Jill,45", "Soa,"]
    source = TransformableRDD(self.sc.parallelize(records), 'csv')
    repeated = source.get_duplicates([0])
    self.assertEqual("Ram,23", repeated.first())
开发者ID:blpabhishek,项目名称:prep-buddy,代码行数:5,代码来源:transformable_rdd_test.py
注：本文中的pyprepbuddy.rdds.transformable_rdd.TransformableRDD类示例由纯净天空整理自GitHub/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论