Spark + Hadoop Snappy read test on Windows

from pyspark.sql import SparkSession
# from pyspark import SparkConf, SparkContext


def init_spark():
    # An earlier attempt that built the context from an explicit SparkConf
    # (raw strings keep the Windows backslashes from being read as escapes):
    # conf = (SparkConf().setAppName("normal spark")
    #         .setMaster("local")
    #         .set("spark.io.compression.codec", "org.apache.spark.io.SnappyCompressionCodec")
    #         .set("spark.driver.extraLibraryPath", r"D:\app\spark-2.4.7-bin-hadoop2.7\jars")
    #         .set("spark.driver.extraClassPath", r"D:\app\spark-2.4.7-bin-hadoop2.7\jars")
    #         .set("spark.executor.extraLibraryPath", r"D:\app\spark-2.4.7-bin-hadoop2.7\jars")
    #         .set("spark.executor.extraClassPath", r"D:\app\spark-2.4.7-bin-hadoop2.7\jars"))
    # sc = SparkContext(conf=conf)

    spark = SparkSession.builder.appName("HelloWorld").getOrCreate()
    sc = spark.sparkContext
    return spark, sc
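The commented-out SparkConf settings can also be folded straight into the SparkSession builder, which avoids juggling a separate SparkContext. A minimal sketch, assuming the same Spark 2.4.7 install paths as above (untested here):

spark = (SparkSession.builder
         .appName("normal spark")
         .master("local")
         .config("spark.io.compression.codec", "org.apache.spark.io.SnappyCompressionCodec")
         .config("spark.driver.extraLibraryPath", r"D:\app\spark-2.4.7-bin-hadoop2.7\jars")
         .config("spark.driver.extraClassPath", r"D:\app\spark-2.4.7-bin-hadoop2.7\jars")
         .config("spark.executor.extraLibraryPath", r"D:\app\spark-2.4.7-bin-hadoop2.7\jars")
         .config("spark.executor.extraClassPath", r"D:\app\spark-2.4.7-bin-hadoop2.7\jars")
         .getOrCreate())
sc = spark.sparkContext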

def main():
    spark, sc = init_spark()
    # nums = sc.parallelize([1, 2, 3, 4])
    # Raw string keeps "\0118" in the path from being read as an octal escape:
    rdd = sc.textFile(r"file:///D:\data\web_learning\0118\metadatas192.168.10.3_20210118_0000.snappy")
    # rdd = sc.textFile(r"file:///D:\data\test.txt")
    print("*" * 88)
    # print(nums.map(lambda x: x * x).collect())
    print(rdd.collect())
    print("*" * 88)


if __name__ == '__main__':
    main()
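rdd.collect() pulls every decoded line back to the driver, which is fine for this smoke test but will choke on a large dump. A gentler check on the same rdd, if the file grows (sketch only):

print(rdd.count())          # number of lines decoded from the .snappy file
for line in rdd.take(5):    # a few lines are enough to confirm Snappy decoding
    print(line)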
# System.setProperty("hadoop.home.dir", "D:\app\hadoop-2.7.3")
# > D:\app\spark-2.4.7-bin-hadoop2.7\bin\spark-submit -Djava.library.path=D:\app\hadoop-2.7.3\bin web_learning.py
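Two notes on the lines above. System.setProperty is the Java/Scala way of setting hadoop.home.dir, and stock spark-submit normally takes JVM flags through --driver-java-options rather than a bare -D switch. From a Python script, the usual equivalent is to export HADOOP_HOME before the session is created; a sketch assuming the same D:\app\hadoop-2.7.3 install, with winutils.exe and the native DLLs under its bin folder:

import os

# Must run before SparkSession.builder.getOrCreate().
os.environ["HADOOP_HOME"] = r"D:\app\hadoop-2.7.3"
os.environ["PATH"] = os.environ["HADOOP_HOME"] + r"\bin;" + os.environ["PATH"]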
Original post: https://www.cnblogs.com/bonelee/p/14334280.html