本文整理汇总了Python中pyspark.sql.column._to_seq函数的典型用法代码示例。如果您正苦于以下问题:Python _to_seq函数的具体用法?Python _to_seq怎么用?Python _to_seq使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了_to_seq函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: agg
def agg(self, *exprs):
"""Compute aggregates and returns the result as a :class:`DataFrame`.
The available aggregate functions are `avg`, `max`, `min`, `sum`, `count`.
If ``exprs`` is a single :class:`dict` mapping from string to string, then the key
is the column to perform aggregation on, and the value is the aggregate function.
Alternatively, ``exprs`` can also be a list of aggregate :class:`Column` expressions.
:param exprs: a dict mapping from column name (string) to aggregate functions (string),
or a list of :class:`Column`.
>>> gdf = df.groupBy(df.name)
>>> sorted(gdf.agg({"*": "count"}).collect())
[Row(name=u'Alice', count(1)=1), Row(name=u'Bob', count(1)=1)]
>>> from pyspark.sql import functions as F
>>> sorted(gdf.agg(F.min(df.age)).collect())
[Row(name=u'Alice', min(age)=2), Row(name=u'Bob', min(age)=5)]
"""
assert exprs, "exprs should not be empty"
if len(exprs) == 1 and isinstance(exprs[0], dict):
jdf = self._jgd.agg(exprs[0])
else:
# Columns
assert all(isinstance(c, Column) for c in exprs), "all exprs should be Column"
jdf = self._jgd.agg(exprs[0]._jc,
_to_seq(self.sql_ctx._sc, [c._jc for c in exprs[1:]]))
return DataFrame(jdf, self.sql_ctx)
开发者ID:1647917675,项目名称:spark,代码行数:30,代码来源:group.py
示例2: bucketBy
def bucketBy(self, numBuckets, col, *cols):
"""Buckets the output by the given columns.If specified,
the output is laid out on the file system similar to Hive's bucketing scheme.
:param numBuckets: the number of buckets to save
:param col: a name of a column, or a list of names.
:param cols: additional names (optional). If `col` is a list it should be empty.
.. note:: Applicable for file-based data sources in combination with
:py:meth:`DataFrameWriter.saveAsTable`.
>>> (df.write.format('parquet') # doctest: +SKIP
... .bucketBy(100, 'year', 'month')
... .mode("overwrite")
... .saveAsTable('bucketed_table'))
"""
if not isinstance(numBuckets, int):
raise TypeError("numBuckets should be an int, got {0}.".format(type(numBuckets)))
if isinstance(col, (list, tuple)):
if cols:
raise ValueError("col is a {0} but cols are not empty".format(type(col)))
col, cols = col[0], col[1:]
if not all(isinstance(c, basestring) for c in cols) or not(isinstance(col, basestring)):
raise TypeError("all names should be `str`")
self._jwrite = self._jwrite.bucketBy(numBuckets, col, _to_seq(self._spark._sc, cols))
return self
开发者ID:ajatix,项目名称:spark,代码行数:30,代码来源:readwriter.py
示例3: coalesce
def coalesce(*cols):
"""Returns the first column that is not null.
>>> cDf = sqlContext.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))
>>> cDf.show()
+----+----+
| a| b|
+----+----+
|null|null|
| 1|null|
|null| 2|
+----+----+
>>> cDf.select(coalesce(cDf["a"], cDf["b"])).show()
+-------------+
|coalesce(a,b)|
+-------------+
| null|
| 1|
| 2|
+-------------+
>>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show()
+----+----+---------------+
| a| b|coalesce(a,0.0)|
+----+----+---------------+
|null|null| 0.0|
| 1|null| 1.0|
|null| 2| 0.0|
+----+----+---------------+
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.coalesce(_to_seq(sc, cols, _to_java_column))
return Column(jc)
开发者ID:deanwampler,项目名称:spark,代码行数:34,代码来源:functions.py
示例4: metrics
def metrics(*metrics):
"""
Given a list of metrics, provides a builder that it turns computes metrics from a column.
See the documentation of [[Summarizer]] for an example.
The following metrics are accepted (case sensitive):
- mean: a vector that contains the coefficient-wise mean.
- variance: a vector tha contains the coefficient-wise variance.
- count: the count of all vectors seen.
- numNonzeros: a vector with the number of non-zeros for each coefficients
- max: the maximum for each coefficient.
- min: the minimum for each coefficient.
- normL2: the Euclidean norm for each coefficient.
- normL1: the L1 norm of each coefficient (sum of the absolute values).
:param metrics:
metrics that can be provided.
:return:
an object of :py:class:`pyspark.ml.stat.SummaryBuilder`
Note: Currently, the performance of this interface is about 2x~3x slower then using the RDD
interface.
"""
sc = SparkContext._active_spark_context
js = JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer.metrics",
_to_seq(sc, metrics))
return SummaryBuilder(js)
开发者ID:Brett-A,项目名称:spark,代码行数:28,代码来源:stat.py
示例5: parquet
def parquet(self, *paths):
"""Loads a Parquet file, returning the result as a :class:`DataFrame`.
>>> df = sqlContext.read.parquet('python/test_support/sql/parquet_partitioned')
>>> df.dtypes
[('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
"""
return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, paths)))
开发者ID:ChenZhongPu,项目名称:Simba,代码行数:8,代码来源:readwriter.py
示例6: concat
def concat(*cols):
"""
Concatenates multiple input string columns together into a single string column.
>>> df = sqlContext.createDataFrame([('abcd','123')], ['s', 'd'])
>>> df.select(concat(df.s, df.d).alias('s')).collect()
[Row(s=u'abcd123')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.concat(_to_seq(sc, cols, _to_java_column)))
开发者ID:EugenCepoi,项目名称:spark,代码行数:10,代码来源:functions.py
示例7: countDistinct
def countDistinct(col, *cols):
"""Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.
>>> df.agg(countDistinct(df.age, df.name).alias('c')).collect()
[Row(c=2)]
>>> df.agg(countDistinct("age", "name").alias('c')).collect()
[Row(c=2)]
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.countDistinct(_to_java_column(col), _to_seq(sc, cols, _to_java_column))
return Column(jc)
开发者ID:deanwampler,项目名称:spark,代码行数:12,代码来源:functions.py
示例8: orc
def orc(self, path):
"""Loads ORC files, returning the result as a :class:`DataFrame`.
.. note:: Currently ORC support is only available together with Hive support.
>>> df = spark.read.orc('python/test_support/sql/orc_partitioned')
>>> df.dtypes
[('a', 'bigint'), ('b', 'int'), ('c', 'int')]
"""
if isinstance(path, basestring):
path = [path]
return self._df(self._jreader.orc(_to_seq(self._spark._sc, path)))
开发者ID:ajatix,项目名称:spark,代码行数:12,代码来源:readwriter.py
示例9: parquet
def parquet(self, *path):
"""Loads a Parquet file, returning the result as a :class:`DataFrame`.
>>> import tempfile, shutil
>>> parquetFile = tempfile.mkdtemp()
>>> shutil.rmtree(parquetFile)
>>> df.saveAsParquetFile(parquetFile)
>>> df2 = sqlContext.read.parquet(parquetFile)
>>> sorted(df.collect()) == sorted(df2.collect())
True
"""
return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, path)))
开发者ID:ZhangQingcheng,项目名称:spark,代码行数:12,代码来源:readwriter.py
示例10: partitionBy
def partitionBy(self, *cols):
"""
Partitions the output by the given columns on the file system.
If specified, the output is laid out on the file system similar
to Hive's partitioning scheme.
:param cols: name of columns
"""
if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
cols = cols[0]
self._jwrite = self._jwrite.partitionBy(_to_seq(self._sqlContext._sc, cols))
return self
开发者ID:QuantiumTechnology,项目名称:spark,代码行数:12,代码来源:readwriter.py
示例11: parquet
def parquet(self, *paths):
"""Loads a Parquet file, returning the result as a :class:`DataFrame`.
You can set the following Parquet-specific option(s) for reading Parquet files:
* ``mergeSchema``: sets whether we should merge schemas collected from all \
Parquet part-files. This will override ``spark.sql.parquet.mergeSchema``. \
The default value is specified in ``spark.sql.parquet.mergeSchema``.
>>> df = spark.read.parquet('python/test_support/sql/parquet_partitioned')
>>> df.dtypes
[('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
"""
return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))
开发者ID:ElfoLiNk,项目名称:spark,代码行数:13,代码来源:readwriter.py
示例12: format_string
def format_string(format, *cols):
"""
Formats the arguments in printf-style and returns the result as a string column.
:param col: the column name of the numeric value to be formatted
:param d: the N decimal places
>>> df = sqlContext.createDataFrame([(5, "hello")], ['a', 'b'])
>>> df.select(format_string('%d %s', df.a, df.b).alias('v')).collect()
[Row(v=u'5 hello')]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.format_string(format, _to_seq(sc, cols, _to_java_column)))
开发者ID:EugenCepoi,项目名称:spark,代码行数:13,代码来源:functions.py
示例13: least
def least(*cols):
"""
Returns the least value of the list of column names, skipping null values.
This function takes at least 2 parameters. It will return null iff all parameters are null.
>>> df = sqlContext.createDataFrame([(1, 4, 3)], ['a', 'b', 'c'])
>>> df.select(least(df.a, df.b, df.c).alias("least")).collect()
[Row(least=1)]
"""
if len(cols) < 2:
raise ValueError("least should take at least two columns")
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.least(_to_seq(sc, cols, _to_java_column)))
开发者ID:EugenCepoi,项目名称:spark,代码行数:13,代码来源:functions.py
示例14: agg
def agg(self, *exprs):
"""Compute aggregates and returns the result as a :class:`DataFrame`.
The available aggregate functions can be:
1. built-in aggregation functions, such as `avg`, `max`, `min`, `sum`, `count`
2. group aggregate pandas UDFs, created with :func:`pyspark.sql.functions.pandas_udf`
.. note:: There is no partial aggregation with group aggregate UDFs, i.e.,
a full shuffle is required. Also, all the data of a group will be loaded into
memory, so the user should be aware of the potential OOM risk if data is skewed
and certain groups are too large to fit in memory.
.. seealso:: :func:`pyspark.sql.functions.pandas_udf`
If ``exprs`` is a single :class:`dict` mapping from string to string, then the key
is the column to perform aggregation on, and the value is the aggregate function.
Alternatively, ``exprs`` can also be a list of aggregate :class:`Column` expressions.
.. note:: Built-in aggregation functions and group aggregate pandas UDFs cannot be mixed
in a single call to this function.
:param exprs: a dict mapping from column name (string) to aggregate functions (string),
or a list of :class:`Column`.
>>> gdf = df.groupBy(df.name)
>>> sorted(gdf.agg({"*": "count"}).collect())
[Row(name=u'Alice', count(1)=1), Row(name=u'Bob', count(1)=1)]
>>> from pyspark.sql import functions as F
>>> sorted(gdf.agg(F.min(df.age)).collect())
[Row(name=u'Alice', min(age)=2), Row(name=u'Bob', min(age)=5)]
>>> from pyspark.sql.functions import pandas_udf, PandasUDFType
>>> @pandas_udf('int', PandasUDFType.GROUPED_AGG) # doctest: +SKIP
... def min_udf(v):
... return v.min()
>>> sorted(gdf.agg(min_udf(df.age)).collect()) # doctest: +SKIP
[Row(name=u'Alice', min_udf(age)=2), Row(name=u'Bob', min_udf(age)=5)]
"""
assert exprs, "exprs should not be empty"
if len(exprs) == 1 and isinstance(exprs[0], dict):
jdf = self._jgd.agg(exprs[0])
else:
# Columns
assert all(isinstance(c, Column) for c in exprs), "all exprs should be Column"
jdf = self._jgd.agg(exprs[0]._jc,
_to_seq(self.sql_ctx._sc, [c._jc for c in exprs[1:]]))
return DataFrame(jdf, self.sql_ctx)
开发者ID:BaiBenny,项目名称:spark,代码行数:51,代码来源:group.py
示例15: partitionBy
def partitionBy(self, *cols):
"""Partitions the output by the given columns on the file system.
If specified, the output is laid out on the file system similar
to Hive's partitioning scheme.
:param cols: name of columns
>>> df.write.partitionBy('year', 'month').parquet(os.path.join(tempfile.mkdtemp(), 'data'))
"""
if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
cols = cols[0]
self._jwrite = self._jwrite.partitionBy(_to_seq(self._sqlContext._sc, cols))
return self
开发者ID:ChenZhongPu,项目名称:Simba,代码行数:14,代码来源:readwriter.py
示例16: struct
def struct(*cols):
"""Creates a new struct column.
:param cols: list of column names (string) or list of :class:`Column` expressions
>>> df.select(struct('age', 'name').alias("struct")).collect()
[Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
>>> df.select(struct([df.age, df.name]).alias("struct")).collect()
[Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
"""
sc = SparkContext._active_spark_context
if len(cols) == 1 and isinstance(cols[0], (list, set)):
cols = cols[0]
jc = sc._jvm.functions.struct(_to_seq(sc, cols, _to_java_column))
return Column(jc)
开发者ID:deanwampler,项目名称:spark,代码行数:15,代码来源:functions.py
示例17: pivot
def pivot(self, pivot_col, *values):
"""Pivots a column of the current DataFrame and preform the specified aggregation.
:param pivot_col: Column to pivot
:param values: Optional list of values of pivotColumn that will be translated to columns in
the output data frame. If values are not provided the method with do an immediate call
to .distinct() on the pivot column.
>>> df4.groupBy("year").pivot("course", "dotNET", "Java").sum("earnings").collect()
[Row(year=2012, dotNET=15000, Java=20000), Row(year=2013, dotNET=48000, Java=30000)]
>>> df4.groupBy("year").pivot("course").sum("earnings").collect()
[Row(year=2012, Java=20000, dotNET=15000), Row(year=2013, Java=30000, dotNET=48000)]
"""
jgd = self._jdf.pivot(_to_java_column(pivot_col),
_to_seq(self.sql_ctx._sc, values, _create_column_from_literal))
return GroupedData(jgd, self.sql_ctx)
开发者ID:archit279thakur,项目名称:spark,代码行数:15,代码来源:group.py
示例18: array
def array(*cols):
"""Creates a new array column.
:param cols: list of column names (string) or list of :class:`Column` expressions that have
the same data type.
>>> df.select(array('age', 'age').alias("arr")).collect()
[Row(arr=[2, 2]), Row(arr=[5, 5])]
>>> df.select(array([df.age, df.age]).alias("arr")).collect()
[Row(arr=[2, 2]), Row(arr=[5, 5])]
"""
sc = SparkContext._active_spark_context
if len(cols) == 1 and isinstance(cols[0], (list, set)):
cols = cols[0]
jc = sc._jvm.functions.array(_to_seq(sc, cols, _to_java_column))
return Column(jc)
开发者ID:deanwampler,项目名称:spark,代码行数:16,代码来源:functions.py
示例19: freqItems
def freqItems(self, cols, support=None):
"""
Finding frequent items for columns, possibly with false positives. Using the
frequent element count algorithm described in
"http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou".
:func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.
:param cols: Names of the columns to calculate frequent items for as a list or tuple of
strings.
:param support: The frequency with which to consider an item 'frequent'. Default is 1%.
The support must be greater than 1e-4.
"""
if isinstance(cols, tuple):
cols = list(cols)
if not isinstance(cols, list):
raise ValueError("cols must be a list or tuple of column names as strings.")
if not support:
support = 0.01
return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx)
开发者ID:ZhangQingcheng,项目名称:spark,代码行数:19,代码来源:dataframe.py
示例20: randomSplit
def randomSplit(self, weights, seed=None):
"""Randomly splits this :class:`DataFrame` with the provided weights.
:param weights: list of doubles as weights with which to split the DataFrame. Weights will
be normalized if they don't sum up to 1.0.
:param seed: The seed for sampling.
>>> splits = df4.randomSplit([1.0, 2.0], 24)
>>> splits[0].count()
1
>>> splits[1].count()
3
"""
for w in weights:
if w < 0.0:
raise ValueError("Weights must be positive. Found weight value: %s" % w)
seed = seed if seed is not None else random.randint(0, sys.maxsize)
rdd_array = self._jdf.randomSplit(_to_seq(self.sql_ctx._sc, weights), long(seed))
return [DataFrame(rdd, self.sql_ctx) for rdd in rdd_array]
开发者ID:ZhangQingcheng,项目名称:spark,代码行数:20,代码来源:dataframe.py
注:本文中的pyspark.sql.column._to_seq函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论