Structured Streaming

https://dzone.com/articles/spark-streaming-vs-structured-streaming    (a comparison of Spark Streaming and Structured Streaming)

1. Micro-batch processing mode

 

Offsets are written to a log before each batch to guarantee consistency, which introduces a small latency of roughly 100 ms.

2. Continuous processing mode

Millisecond-level latency; the log is written asynchronously.
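In PySpark, which of the two modes a query runs in is selected through the trigger on the streaming writer. A minimal sketch, assuming df is an already-created streaming DataFrame and using the console sink only for illustration:

# micro-batch mode: start a new micro-batch every 8 seconds
query = df.writeStream.format("console").trigger(processingTime="8 seconds").start()

# continuous processing mode (Spark 2.3+, experimental): checkpoints are written asynchronously about every second
query = df.writeStream.format("console").trigger(continuous="1 second").start()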

A Structured Streaming program (word count over a socket source)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pyspark.sql import SparkSession
from pyspark.sql.functions import split
from pyspark.sql.functions import explode

if __name__ == "__main__":
    spark = SparkSession.builder.appName("StructuredNetworkWordCount").getOrCreate()
    # getOrCreate: return the SparkSession of the current process if one exists;
    # otherwise reuse the global default session, and if there is none either, create a new one

    spark.sparkContext.setLogLevel("WARN")    # get rid of INFO log output

    # read lines of text from a socket source on localhost:9999
    lines = (spark.readStream.format("socket")
             .option("host", "localhost")
             .option("port", 9999)
             .load())

    # split each line into words and count them
    words = lines.select(
            explode(
                split(lines.value, " ")
                ).alias("word")
            )
    wordCounts = words.groupBy("word").count()

    # write the complete result table to the console every 8 seconds
    query = (wordCounts.writeStream.outputMode("complete")
             .format("console")
             .trigger(processingTime="8 seconds")
             .start())

    query.awaitTermination()

spark-submit --master local StructuredNetworkWordCount.py    # YARN mode needs a multi-node cluster, so run in local mode
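Before submitting, something must be listening on localhost:9999 for the socket source to connect to; the usual way to test is to start netcat in another terminal and type words into it:

nc -lk 9999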

 

However, the job was only run for about 10 seconds, so only the results produced within that window are shown.

File source

 structuredStreamingFileSourceGenerator.py

import os
import shutil
import random
import time

TEST_DATA_TEMP_DIR = '/tmp/'                          # temporary directory where files are first written
TEST_DATA_DIR = '/tmp/testdata/'                      # target folder created under /tmp

ACTION_DEF = ['login', 'logout', 'purchase']          # three kinds of actions
DISTRICT_DEF = ['fujian', 'beijing', 'shanghai', 'guangzhou']
JSON_LINE_PATTERN = '{{"eventTime":{},"action":"{}","district":"{}"}}\n'    # one JSON record per line; '\n' is an escape character

# test setup: if the folder already exists, delete the old data, then create a fresh folder
def test_setUp():
    if os.path.exists(TEST_DATA_DIR):
        shutil.rmtree(TEST_DATA_DIR, ignore_errors=True)
    os.mkdir(TEST_DATA_DIR)

# test teardown: delete the folder once the experiment is finished
def test_tearDown():
    if os.path.exists(TEST_DATA_DIR):
        shutil.rmtree(TEST_DATA_DIR, ignore_errors=True)

# generate a testing file: write it to the temporary directory first, then move it into the target folder
def write_and_move(filename, data):
    with open(TEST_DATA_TEMP_DIR + filename, "wt", encoding="utf-8") as f:   # "wt" = write text; with open closes the file automatically
        f.write(data)
    shutil.move(TEST_DATA_TEMP_DIR + filename, TEST_DATA_DIR + filename)

if __name__ == "__main__":
    test_setUp()
    for i in range(1000):                             # generate 1000 files, each with 100 lines of data
        filename = 'e-mail-{}.json'.format(i)         # .format() inserts the values into the filename pattern
        content = ''
        rndcount = list(range(100))
        random.shuffle(rndcount)
        for _ in rndcount:
            content += JSON_LINE_PATTERN.format(str(int(time.time())),
                                                random.choice(ACTION_DEF),
                                                random.choice(DISTRICT_DEF))
        write_and_move(filename, content)
        time.sleep(1)
    test_tearDown()
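With the JSON_LINE_PATTERN above, each line of every generated file is one self-contained JSON record, for example (the epoch-second timestamp shown here is just illustrative):

{"eventTime":1573290000,"action":"purchase","district":"beijing"}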

Step 2: create a program that counts the data

 

 structuredStreamingFileCount.py

# -*- coding: UTF-8 -*-

import os
import shutil
from pprint import pprint

from pyspark.sql import SparkSession
from pyspark.sql.functions import window,asc
from pyspark.sql.types import StructType,StructField
from pyspark.sql.types import TimestampType,StringType

#define JSON file location as constant
TEST_DATA_DIR_SPARK='file:///tmp/testdata/'

if __name__ == "__main__":
    schema=StructType([StructField("eventTime",TimestampType(),True),
        StructField("action",StringType(),True),
        StructField("district",StringType(),True)])

    spark = (SparkSession
            .builder
            .appName("StructuredPurchaseCount")
            .getOrCreate())

    spark.sparkContext.setLogLevel("WARN")

    # set up the streaming computation
    lines = (spark.readStream
            .format("json")
            .schema(schema)
            .option("maxFilesPerTrigger", 100)     # process at most 100 new files per trigger
            .load(TEST_DATA_DIR_SPARK))

    # define the window length
    windowDuration = '1 minutes'

    windowedCounts = (lines.filter("action='purchase'")                    # only count purchase actions
            .groupBy('district', window('eventTime', windowDuration))      # group by district and by 1-minute event-time window
            .count()
            .sort(asc('window')))                                          # sort the results by window

    # start the streaming query
    query = (windowedCounts
            .writeStream
            .outputMode("complete")
            .format("console")
            .option('truncate', 'false')
            .trigger(processingTime="10 seconds")
            .start())

    query.awaitTermination()
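To try the file-source example, run the generator in one terminal and submit the counting job in another (assuming both scripts sit in the current directory):

python3 structuredStreamingFileSourceGenerator.py
spark-submit --master local structuredStreamingFileCount.py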

Original article: https://www.cnblogs.com/cschen588/p/11829481.html