pyspark一些简单常用的函数方法

1、将一个字符或数字列转换为vector/array

from pyspark.sql.functions import col,udf
from pyspark.ml.linalg import Vectors, _convert_to_vector, VectorUDT, DenseVector

# 数字的可转为vector,但字符串转为vector会报错
to_vec = udf(lambda x: DenseVector([x]), VectorUDT())

# 字符串转为array
to_array = udf(lambda x: [x], ArrayType(StringType()))

2、从一个向量或数组列中获取某个位置处的值

df = spark.createDataFrame([(1, [1,2,3]), (2, [4,5,6])], ['label', 'data'])
df.show()
df.printSchema()

+-----+---------+
|label|     data|
+-----+---------+
|    1|[1, 2, 3]|
|    2|[4, 5, 6]|
+-----+---------+

root
 |-- label: long (nullable = true)
 |-- data: array (nullable = true)
 |    |-- element: long (containsNull = true)

# 可以根据某一列的值作为索引来选择特定位置的值
from pyspark.sql.functions import udf,col
from pyspark.sql.types import FloatType

firstelement=udf(lambda k,v:float(v[int(k)]),FloatType())
df.withColumn('value', firstelement(col('label'), col('data'))).show(4, truncate=False)
+-----+---------+-----+
|label|data     |value|
+-----+---------+-----+
|1    |[1, 2, 3]|2.0  |
|2    |[4, 5, 6]|6.0  |
+-----+---------+-----+ 

3、单个list列变多列

参考https://stackoverflow.com/questions/45789489/how-to-split-a-list-to-multiple-columns-in-pyspark

4、获取每个类别的前n个数据

参考:https://stackoverflow.com/questions/38397796/retrieve-top-n-in-each-group-of-a-dataframe-in-pyspark

rdd = sc.parallelize([("user_1",  "object_1",  3), 
                      ("user_1",  "object_2",  2), 
                      ("user_2",  "object_1",  5), 
                      ("user_2",  "object_2",  2), 
                      ("user_2",  "object_2",  6)])
df = sqlContext.createDataFrame(rdd, ["user_id", "object_id", "score"])

from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

window = Window.partitionBy(df['user_id']).orderBy(df['score'].desc())

df.select('*', rank().over(window).alias('rank')) 
  .filter(col('rank') <= 2) 
  .show() 
#+-------+---------+-----+----+
#|user_id|object_id|score|rank|
#+-------+---------+-----+----+
#| user_1| object_1|    3|   1|
#| user_1| object_2|    2|   2|
#| user_2| object_2|    6|   1|
#| user_2| object_1|    5|   2|
#+-------+---------+-----+----+

5、将字符串转变为日期

from pyspark.sql.functions import min, max

df = spark.createDataFrame([
  "2017-01-01", "2018-02-08", "2019-01-03", "2019-01-01"], "string"
).selectExpr("CAST(value AS date) AS date")

# 或者
df = spark.createDataFrame([
  "2017-01-01", "2018-02-08", "2019-01-03", "2019-01-01"], "string"
).withColumn('date', F.col('value').cast(DateType())) 

min_date, max_date = df.select(min("date"), max("date")).first()
min_date, max_date

# (datetime.date(2017, 1, 1), datetime.date(2019, 1, 3))

6、检测缺失值

d1 = spark.createDataFrame([(10,'a',None), (20, 'b',3), (30, 'c',4), 
                            (20, 'b',5), (30, 'd',6), (40, None,7), 
                            (None, 'e',8)], ['value', 'key','v2'])
d1 = d1.select('key', 'value', 'v2')

d1.where(reduce(lambda x, y: x | y, (F.col(x).isNull() for x in d1.columns))).show()

# 或者
d1.where(F.col('key').isNull() | F.col('value').isNull() | F.col('v2').isNull()).show()

# +----+-----+----+
# | key|value|  v2|
# +----+-----+----+
# |   a|   10|null|
# |null|   40|   7|
# |   e| null|   8|
# +----+-----+----+

更进一步地,同时还检测NaN和空字符串:

d1 = spark.createDataFrame([(10,'a', None), (20, 'b', 3.), (30, 'c',4.), 
                            (20, 'b',5.), (30, 'd', np.nan), (40, None,7.), 
                            (None, 'e',8.), (50, '', 8.)], ['value', 'key','v2'])
d1 = d1.select('key', 'value', 'v2')
d1.show()
# +----+-----+----+
# | key|value|  v2|
# +----+-----+----+
# |   a|   10|null|
# |   b|   20| 3.0|
# |   c|   30| 4.0|
# |   b|   20| 5.0|
# |   d|   30| NaN|
# |null|   40| 7.0|
# |   e| null| 8.0|
# |    |   50| 8.0|
# +----+-----+----+

d1.where((F.col('key').isNotNull()) & (F.col('key')!='') & (~F.isnan(F.col('v2')))).show()
# +---+-----+----+
# |key|value|  v2|
# +---+-----+----+
# |  a|   10|null|
# |  b|   20| 3.0|
# |  c|   30| 4.0|
# |  b|   20| 5.0|
# |  e| null| 8.0|
# +---+-----+----+

7、填充非连续时间序列

参考:https://walkenho.github.io/interpolating-time-series-p2-spark/
https://stackoverflow.com/questions/42411184/filling-gaps-in-timeseries-spark

或者 https://stackoverflow.com/questions/39271374/pyspark-how-to-resample-frequencies

import random

data = {'readtime' : pd.date_range(start='1/15/2018', end='02/14/2018', freq='D')\
                       .append(pd.date_range(start='1/15/2018', end='02/14/2018', freq='D')),
        'house' : ['house1' for i in range(31)] + ['house2' for i in range(31)],
        'readvalue': [0.5+0.5*np.sin(2*np.pi/30*i) for i in range(31)]\
                     + [0.5+0.5*np.cos(2*np.pi/30*i) for i in range(31)]}
df0 = pd.DataFrame(data, columns = ['readtime', 'house', 'readvalue'])
random.seed(42)
df0 = df0.drop(random.sample(range(df0.shape[0]), k=int(df0.shape[0]/2)))
df0.head()

    readtime    house   readvalue
2   2018-01-17  house1  0.703368
5   2018-01-20  house1  0.933013
7   2018-01-22  house1  0.997261
8   2018-01-23  house1  0.997261
14  2018-01-29  house1  0.603956

import pyspark.sql.functions as func
from pyspark.sql.functions import col
df = spark.createDataFrame(df0)
df = df.withColumn("readtime", col('readtime')/1e9)\
       .withColumn("readtime_existent", col("readtime"))
df.show(10, False)

+-----------+------+--------------------+-----------------+
|readtime   |house |readvalue           |readtime_existent|
+-----------+------+--------------------+-----------------+
|1.5161472E9|house1|0.7033683215379001  |1.5161472E9      |
|1.5164064E9|house1|0.9330127018922193  |1.5164064E9      |
|1.5165792E9|house1|0.9972609476841366  |1.5165792E9      |
|1.5166656E9|house1|0.9972609476841368  |1.5166656E9      |
|1.517184E9 |house1|0.6039558454088798  |1.517184E9       |
|1.5172704E9|house1|0.5000000000000001  |1.5172704E9      |
|1.5174432E9|house1|0.2966316784621001  |1.5174432E9      |
|1.5175296E9|house1|0.2061073738537635  |1.5175296E9      |
|1.5177024E9|house1|0.06698729810778081 |1.5177024E9      |
|1.5177888E9|house1|0.024471741852423234|1.5177888E9      |
+-----------+------+--------------------+-----------------+
only showing top 10 rows

三步实现填充时间gap:

  • In the first step, we group the data by ‘house’ and generate an array containing an equally spaced time grid for each house.

  • In the second step, we create one row for each element of the arrays by using the spark SQL function explode().

  • In the third step, the resulting structure is used as a basis to which the existing read value information is joined using an outer left join.

from pyspark.sql.types import *

# define function to create date range
def date_range(t1, t2, step=60*60*24):
    """Return a list of equally spaced points between t1 and t2 with stepsize step."""
    return [t1 + step*x for x in range(int((t2-t1)/step)+1)]

# define udf
date_range_udf = func.udf(date_range, ArrayType(LongType()))

# obtain min and max of time period for each house
df_base = df.groupBy('house')\
            .agg(func.min('readtime').cast('integer').alias('readtime_min'), 
                 func.max('readtime').cast('integer').alias('readtime_max'))

# generate timegrid and explode
df_base = df_base.withColumn("readtime", func.explode(date_range_udf("readtime_min", "readtime_max")))\
             .drop('readtime_min', 'readtime_max')

# left outer join existing read values
df_all_dates = df_base.join(df, ["house", "readtime"], "leftouter")

tmp = df_all_dates.withColumn('readtime', func.from_unixtime(col('readtime')))
tmp.orderBy('house','readtime').show(20, False)

+------+-------------------+--------------------+-----------------+
|house |readtime           |readvalue           |readtime_existent|
+------+-------------------+--------------------+-----------------+
|house1|2018-01-17 08:00:00|0.7033683215379001  |1.5161472E9      |
|house1|2018-01-18 08:00:00|null                |null             |
|house1|2018-01-19 08:00:00|null                |null             |
|house1|2018-01-20 08:00:00|0.9330127018922193  |1.5164064E9      |
|house1|2018-01-21 08:00:00|null                |null             |
|house1|2018-01-22 08:00:00|0.9972609476841366  |1.5165792E9      |
|house1|2018-01-23 08:00:00|0.9972609476841368  |1.5166656E9      |
|house1|2018-01-24 08:00:00|null                |null             |
|house1|2018-01-25 08:00:00|null                |null             |
|house1|2018-01-26 08:00:00|null                |null             |
|house1|2018-01-27 08:00:00|null                |null             |
|house1|2018-01-28 08:00:00|null                |null             |
|house1|2018-01-29 08:00:00|0.6039558454088798  |1.517184E9       |
|house1|2018-01-30 08:00:00|0.5000000000000001  |1.5172704E9      |
|house1|2018-01-31 08:00:00|null                |null             |
|house1|2018-02-01 08:00:00|0.2966316784621001  |1.5174432E9      |
|house1|2018-02-02 08:00:00|0.2061073738537635  |1.5175296E9      |
|house1|2018-02-03 08:00:00|null                |null             |
|house1|2018-02-04 08:00:00|0.06698729810778081 |1.5177024E9      |
|house1|2018-02-05 08:00:00|0.024471741852423234|1.5177888E9      |
+------+-------------------+--------------------+-----------------+
only showing top 20 rows

8、当前行与上一行值得差

参考:https://www.arundhaj.com/blog/calculate-difference-with-previous-row-in-pyspark.html

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window

sc = SparkContext(appName="PrevRowDiffApp")
sqlc = SQLContext(sc)

rdd = sc.parallelize([(1, 65), (2, 66), (3, 65), (4, 68), (5, 71)])

df = sqlc.createDataFrame(rdd, ["id", "value"])

my_window = Window.partitionBy().orderBy("id")

df = df.withColumn("prev_value", F.lag(df.value).over(my_window))
df = df.withColumn("diff", F.when(F.isnull(df.value - df.prev_value), 0)
                              .otherwise(df.value - df.prev_value))

df.show()
+---+-----+----------+----+
| id|value|prev_value|diff|
+---+-----+----------+----+
|  1|   65|      null|   0|
|  2|   66|        65|   1|
|  3|   65|        66|  -1|
|  4|   68|        65|   3|
|  5|   71|        68|   3|
+---+-----+----------+----+

9、udf如何传入多个参数来计算list column不同的quantile?

利用闭包:https://stackoverflow.com/questions/52843485/pyspark-pass-multiple-columns-along-with-an-argument-in-udf

def quantile_udf(q=0.5):
    """计算quantile
    """
    def quantile_(arr):
        return float(np.quantile(arr, q))
    return F.udf(quantile_, DoubleType())

history_stat = history_stat.withColumn('quantile_025', quantile_udf(0.25)(F.col('date_diff_list')))\
                        .withColumn('quantile_05', quantile_udf(0.5)(F.col('date_diff_list')))\
                        .withColumn('quantile_075', quantile_udf(0.75)(F.col('date_diff_list')))


history_stat.show(2)
+---------------+-----------------+----------+----------+----------+------------------+-------------------+--------------------+------------+-----------+------------+
|cust_store_dkey|cust_product_dkey| first_day|  last_day|sale_times|max_sales_interval|mean_sales_interval|      date_diff_list|quantile_025|quantile_05|quantile_075|
+---------------+-----------------+----------+----------+----------+------------------+-------------------+--------------------+------------+-----------+------------+
|            560|              211|2017-12-10|2019-03-08|       180|                16| 2.5166666666666666|[0, 11, 4, 11, 1,...|         1.0|        1.0|         3.0|
|            560|              990|2016-12-30|2017-03-17|        20|                26|               3.85|[0, 1, 1, 1, 2, 3...|         1.0|        2.0|         4.0|
+---------------+-----------------+----------+----------+----------+------------------+-------------------+--------------------+------------+-----------+------------+

10、对特定条件的值进行替换

https://stackoverflow.com/questions/44773758/how-to-conditionally-replace-value-in-a-column-based-on-evaluation-of-expression

import numpy as np

df = spark.createDataFrame(
    [(1, 1, None),
     (1, 2, float(5)),
     (1, 3, np.nan),
     (1, 4, None),
     (0, 5, float(10)),
     (1, 6, float('nan')),
     (0, 6, float('nan'))],
    ('session', "timestamp1", "id2"))

+-------+----------+----+
|session|timestamp1| id2|
+-------+----------+----+
|      1|         1|null|
|      1|         2| 5.0|
|      1|         3| NaN|
|      1|         4|null|
|      0|         5|10.0|
|      1|         6| NaN|
|      0|         6| NaN|
+-------+----------+----+

from pyspark.sql.functions import when

targetDf = df.withColumn("timestamp1", \
              when(df["session"] == 0, 999).otherwise(df["timestamp1"]))

11、如何将sparse vector转化为array?

vector_udf = udf(lambda vector: vector.toArray().tolist(), ArrayType(FloatType()))

df = df.withColumn('col1', vector_udf('col2'))

需要注意的是,udf中的tolist() 是必须的, 因为spark中没有np.array类型。类似的,当我们返回一个np.dtype类型数据的时候,也需要使用floatint对其进行转换。

12、如何将pyspark sparse vector转化为 scipy sparse matrix以及pytorch sparse tensor?

参考:https://stackoverflow.com/questions/40557577/pyspark-sparse-vectors-to-scipy-sparse-matrix
将spark sparse vector转换为scipy csr matrix如下:

import numpy as np
from scipy.sparse import vstack
import numpy as np
from scipy.sparse import csr_matrix
import torch

def as_matrix(vec):
    data, indices = vec.values, vec.indices
    shape = 1, vec.size
    return csr_matrix((data, indices, np.array([0, vec.values.size])), shape)

# cv_cols表示spark中countvectorizer得到的稀疏矩阵
train_pd[csr_cols] = train_pd[cv_cols].applymap(lambda x: as_matrix(x))

# 上面的代码将每一行的spark sparse vector转换为了scipy csr matrix,
# 通过下面的代码可以将每一列的所有行csr matrix进行合并,得到一个大的csr matrix
csr_col1 = vstack(train_pd['csr_col1'])

通过上面的代码可以将sparse vector转换为scipy sparse matrix,具体地——scipy csr matrix。

下面我们再将scipy csr matrix转换为pytorch sparse tensor。

def sparse2tensor(tmpdf):
    """
    tmpdf 表示一个scipy csr matrix,如上面得到的csr_col1。
    """
    tmpdf_coo = vstack(tmpdf).tocoo()
    # 下面代码中的torch.Size能保证转换为sparse tensor后维度一致
    sptensor = torch.sparse.FloatTensor(torch.LongTensor([tmpdf_coo.row.tolist(), tmpdf_coo.col.tolist()]),
                                     torch.FloatTensor(tmpdf_coo.data), torch.Size(tmpdf_coo.shape))
    return sptensor

spt = sparse2tensor(csr_cols1)

当数据维度非常大且稀疏的时候,使用sparse matrix/tensor能极大的减少内存占用,是一个非常实用的方法。

12、稀疏向量求和

def sum_vector(vector):
    return float(vector.values.sum())

13、bigint 转timestamp

df = df.withColumn('ttl2', 
          Func.from_unixtime(col("ttl").cast('float') / 1000.0 , 'yyyyMMdd'))

13、大量category的string2index实现

在进行label encoder的时候,一般使用stringIndex函数。但是有时候由于类别数太多,达到几千万上亿,如对推荐的MF中ID的编码,此时仍然使用stringIndex容易内存溢出,因为这几千万个字符要进行序列化并传播需要较大的机器内存。这时候我们就需要使用其他的方法来进行编码了。

一种方法是借鉴window函数,以id作为key进行分window并对每个window赋一个值:

# 原数据如下:
id | col |
1  |  a  |
2  |  a  |
3  |  b  |
4  |  c  |
5  |  c  |

from pyspark.sql import window as W
import pyspark.sql.functions as f
df.select('id', f.dense_rank().over(W.Window.orderBy('col')).alias('group')).show(truncate=False)
# 结果如下:
+---+-----+
|id |group|
+---+-----+
|1  |1    |
|2  |1    |
|3  |2    |
|4  |3    |
|5  |3    |
+---+-----+

此外,还有一种方法是借用window的row_number函数,先给每一个唯一id编码,然后进行join:

df2 = data.select("col").distinct()
  .withColumn("group", row_number().over(Window.orderBy("col")))

result = data.join(df2,  on="col",  how="inner")

参考:https://stackoverflow.com/questions/50233518/create-a-group-id-over-a-window-in-spark-dataframe.

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 202,056评论 5 474
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 84,842评论 2 378
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 148,938评论 0 335
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 54,296评论 1 272
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 63,292评论 5 363
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 48,413评论 1 281
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 37,824评论 3 393
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 36,493评论 0 256
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 40,686评论 1 295
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 35,502评论 2 318
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 37,553评论 1 329
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 33,281评论 4 318
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 38,820评论 3 305
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 29,873评论 0 19
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 31,109评论 1 258
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 42,699评论 2 348
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 42,257评论 2 341