spark算子(二)

1.collect算子

*使用foreachACTION操作 ,collect在远程集群中遍历RDD的元素

*使用collect操作,将分布式在远程集群中的数据拉取到本地
*这种方式不建议使用,如果数据量大,会使用大量 的网络带宽
*这种方式不建议使用。

package kw.test.action;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

public class Collect {
	public static void main(String[] args) {
		SparkConf sparkConf = new SparkConf().setAppName("FlatMap").setMaster("local");
		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
		List<String> list = Arrays.asList("wo am xues","ni shi la ji ","zha zs","zha df");
		JavaRDD<String> javaRDD = javaSparkContext.parallelize(list);
		JavaRDD<String> result = javaRDD.flatMap(new FlatMapFunction<String, String>() {

			@Override
			public Iterator<String> call(String arg0) throws Exception {
				// TODO Auto-generated method stub
				//首选将数据分割,然后将和数据值机型压扁。
				return Arrays.asList(arg0.split(" ")).iterator();
			}
		});
		List<String> str = result.collect();
		for(String s : str)
		{
			System.out.println(s);
		}
		
	}
}

 2.count

* 这个是一个action,他是有返回值的。

package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

/*
 * 这个是一个action,他是有返回值的。
 */
public class count {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("FlatMap").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        List<Integer> list = Arrays.asList(1,2,3,4,5,6,7,8,9);
        JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(list);
        long num = javaRDD.count();
        System.out.println(num);
    }

}

3.filter

*将返回true的数据进行输出

package kw.test.action;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;

public class Filter {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("Filter").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        List<Integer> list = Arrays.asList(1,2,3,4,5,6);
        JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(list);
        JavaRDD<Integer> redult = javaRDD.filter(new Function<Integer, Boolean>() {

            @Override
            public Boolean call(Integer arg0) throws Exception {
                // TODO Auto-generated method stub
                return arg0%2==0;//返回的为true的就可以了。
            }
        });
        redult.foreach(new VoidFunction<Integer>() {
            
            @Override
            public void call(Integer arg0) throws Exception {
                // TODO Auto-generated method stub
                System.out.println(arg0);
            }
        });
    }
}

4.coalesce

package kw.test.action;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
/*
 * 此参数将返回一个RDD,分区的参数会别为设置的哪一个参数的个数
 * 
 * 这个返回是一个窄依赖,如果将一个100的变为10个partition的时候,这个时候
 * 不会进行shuffle运算,
 * 
 * 如果将好多个partition变为一个的时候,这个时候需要使用shuffle,他会进行shuffle的传递
 * 一般是较少使用,因为需要将数据进行传递。
 * 
 * 
 * 
 * 不仅仅是将partition变得更少,同时也可以将partition变的更大,这个时候需要将shuffle变为true
 * 如果将partition变的更多的时候,也是需要将将shuffle设置为true的。
 */


public class FilterOperter {
	public static void main(String[] args) {
		SparkConf sparkConf = new SparkConf().setAppName("Coalesce").setMaster("local");
		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
		List<Integer> list = Arrays.asList(1,2,3,4,5,6);
		JavaRDD<Integer> numbers = javaSparkContext.parallelize(list,6);
		//Colesce算子,在执行filter之后,有的partition上的数据就会变得很少,容易造成数据的倾斜
		JavaRDD<String> result = numbers.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<String>>() {

			@Override
			public Iterator<String> call(Integer arg0, Iterator<Integer> arg1)
					throws Exception {
				// TODO Auto-generated method stub
				List<String> list = new ArrayList<String>();
				while(arg1.hasNext())
				{
					list.add(arg0+"      "+arg1.next());
				}
				return list.iterator();
			}
		}, true);
		
		result.foreach(new VoidFunction<String>() {

			@Override
			public void call(String arg1) throws Exception {
				// TODO Auto-generated method stub
				System.out.println(arg1+"    ");
			}
			
		});
		JavaRDD<String> javaCoaxlesce = result.coalesce(3);
		JavaRDD<String> coalesce = javaCoaxlesce.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {

			@Override
			public Iterator<String> call(Integer arg0, Iterator<String> arg1)
					throws Exception {
				// TODO Auto-generated method stub
				List<String> list = new ArrayList<String>();
				while(arg1.hasNext())
				{
					list.add(arg1.next()+"      "+arg0);
					
				}
				return list.iterator();
			}
		}, true);//参数的含义:true的含义就是是否进行shuffle。默认是不进行shuffle
		coalesce.foreach(new VoidFunction<String>() {

			@Override
			public void call(String arg0) throws Exception {
				// TODO Auto-generated method stub
				System.out.println("   "+ arg0);
			}
		});
	}
		

}

 5.floatmap

package kw.test.action;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;

/*
 * 先执行一个map操作,然后将数据flat压扁
 */
public class FlatMap {
	public static void main(String[] args) {
		SparkConf sparkConf = new SparkConf().setAppName("FlatMap").setMaster("local");
		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
		List<String> list = Arrays.asList("wo am xues","ni shi la ji ","zha zs","zha df");
		JavaRDD<String> javaRDD = javaSparkContext.parallelize(list);
		JavaRDD<String> result = javaRDD.flatMap(new FlatMapFunction<String, String>() {

			@Override
			public Iterator<String> call(String arg0) throws Exception {
				// TODO Auto-generated method stub
				//首选将数据分割,然后将和数据值机型压扁。
				return Arrays.asList(arg0.split(" ")).iterator();
			}
		});
		result.foreach(new VoidFunction<String>() {

			@Override
			public void call(String arg0) throws Exception {
				// TODO Auto-generated method stub
				System.out.println(arg0);
			}
		});
		
		
	}

}

6.mappartition

package kw.test.action;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;

public class MapPartition {
	public static void main(String[] args) {
		SparkConf sparkConf = new SparkConf().setAppName("MapPartition").setMaster("local");
		JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
		List<String> list = Arrays.asList("wo","shi","lal","woowj","dffdds");
		JavaRDD<String> javaRDD = sparkContext.parallelize(list);
		

		JavaRDD<String> result = javaRDD.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {

			//将partition的数据全部给map,然后使用迭代器处理
			@Override
			public Iterator<String> call(Iterator<String> arg0)throws Exception {
				// TODO Auto-generated method stub
			   List<String> list = new ArrayList<String>();
				while(arg0.hasNext() )
				{
					list.add(arg0.next());
				}
				return  list.iterator();
			}
		});
		result.foreach(new VoidFunction<String>() {

			@Override
			public void call(String arg0) throws Exception {
				// TODO Auto-generated method stub
				System.out.println(arg0);
			}});

		 
	}

}

  7.MapPartitionsWithIndex

package kw.test.action;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;


/*
 *k可以获取到里面的分区位置
 */
public class MapPartitionsWithIndex {
	public static void main(String[] args) {
		SparkConf sparkConf = new SparkConf().setAppName("MapPartitionsWithIndex").setMaster("local");
		JavaSparkContext sparkContext =new JavaSparkContext(sparkConf);
		
		List<String> list = Arrays.asList("kang","wang","ddd","kang1","wang2","ddd3");
		JavaRDD<String> javaRDD = sparkContext.parallelize(list,4);
		
		JavaRDD<String> javaRDD2 = javaRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {

			@Override
			public Iterator<String> call(Integer arg0, Iterator<String> arg1)
					throws Exception {
				// TODO Auto-generated method stub
				List<String> list = new ArrayList<String>();
				while(arg1.hasNext())
				{
					list.add(arg1.next()+"  "+ arg0);
				}
				return list.iterator();
			}
		}, true);
		javaRDD2.foreach(new VoidFunction<String>() {
			
			@Override
			public void call(String arg0) throws Exception {
				// TODO Auto-generated method stub
				System.out.println(arg0);
			}
		});
	}
}

  8.Reduce 

package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class Reduce {
	public static void main(String[] args) {
		SparkConf sparkConf = new SparkConf().setAppName("Reduce").setMaster("local");
		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
		List<Integer> list= Arrays.asList(1,2,3,4,5,6,7);   
		JavaRDD<Integer> num = javaSparkContext.parallelize(list);
		int sum = num.reduce(new Function2<Integer, Integer, Integer>() {
			
			@Override
			public Integer call(Integer arg0, Integer arg1) throws Exception {
				// TODO Auto-generated method stub
				return arg0+arg1;
			}
		});
		System.out.println("最终结果:"+sum);
	}

}

9.ReduceByKey

package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/*
 * ReduceByKey是一个shuffle操作
 * 
 * 在shuffle的时候,分为map端和reduce端
 * 
 * spark里面的reduceByKey在map端映带conbiner。
 * 也就是在map中处理然后将其累加,减少了网络的传输,效率更高。
 */
public class ReduceByKey {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("Reduce").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        List<Tuple2<String, Integer>> list= Arrays.asList(new Tuple2<String,Integer>("kang",100),
                new Tuple2<String,Integer>("kang",100),
                new Tuple2<String,Integer>("kang",100),
                new Tuple2<String,Integer>("kang",100),
                new Tuple2<String,Integer>("wang",100));
        JavaPairRDD<String, Integer> num = javaSparkContext.parallelizePairs(list);
        JavaPairRDD<String, Integer> rr = num.reduceByKey(new Function2<Integer, Integer, Integer>() {
            
            @Override
            public Integer call(Integer arg0, Integer arg1) throws Exception {
                // TODO Auto-generated method stub
                return arg0+arg1;
            }
        });
        rr.foreach(new VoidFunction<Tuple2<String,Integer>>() {
            
            @Override
            public void call(Tuple2<String, Integer> arg0) throws Exception {
                // TODO Auto-generated method stub
                System.out.println(arg0);
            }
        });
    }

}

10.RePartition

package kw.test.action;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;


/*
 * repartition算子,用于任意数据量RDD的partition增多或者减少。
 * coalesce仅仅将RDD的数量减少【其实不是这样的】
 * 、
 * 
 * 建议使用的场景:
 * 一个很经典的场景,使用spark SQL从HIVE中查询数据的时候,spark SQLhui 
 * 会根据HIVE对应的hdfs文件的block的数量决定加载出来的RDD的个数是
 * 多少个,这里默认的partition的数量是我们根本无法设置的
 * 
 * 
 * 有的时候,可能他会自动的设置partition的数量过于少,就进行优化
 * 可以提高并发度,就是对RDD使用的partition的算子。
 * 
 * 
 * 一般情况下,我们为了减少shuffle的时候,我们首选使用coalesce
 * 因为他可以避免shuffle操作。
 * 
 * 
 */
public class RePartition {
	public static void main(String[] args) {
		SparkConf sparkConf = new SparkConf().setAppName("Coalesce").setMaster("local");
		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
		List<Integer> list = Arrays.asList(1,2,3,4,5,6);
		
		JavaRDD<Integer> numbers = javaSparkContext.parallelize(list);
		
		JavaRDD<String> result = numbers.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<String>>() {

			@Override
			public Iterator<String> call(Integer arg0, Iterator<Integer> arg1)
					throws Exception {
				// TODO Auto-generated method stub
			
				List<String> list = new ArrayList<String>();
				while(arg1.hasNext())
				{
					list.add(arg1.next()+"      "+arg0);
				}
				return list.iterator();
			}
		}, true);
		result.foreach(new VoidFunction<String>() {
			
			@Override
			public void call(String arg0) throws Exception {
				// TODO Auto-generated method stub
				System.out.println(arg0+"           ");
			}
		});
		

		result.repartition(5);
		JavaRDD<String> result2 = result.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {

			@Override
			public Iterator<String> call(Integer arg0, Iterator<String> arg1)
					throws Exception {
				// TODO Auto-generated method stub
				List<String> list = new ArrayList<String>();
				while(arg1.hasNext())
				{
					list.add(arg1.next()+"        "+arg0);
				}
				return list.iterator();
			}
		
		}, true);
		result2.foreach(new VoidFunction<String>() {
			
			@Override
			public void call(String arg0) throws Exception {
				// TODO Auto-generated method stub
				System.out.println(arg0+"           ");
			}
		});
	}

}

11.Sample

package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

/*
 * 随机采样从,可以传入一个Float,不如数0.3就是采样30%
 * 比如说我在测试的时候使用的是5个单词,最后也就显示的是1个,当然了 Float的值是可以修改的,参数的个数也是可以自己给定的。
 * 
 * 这个也可以设置partition的个数,如果是一个就先手wordnum*float,如果是两个的时候,他们的值是每一个里面取出wordNum*float的值
 */
public class Sample {
	public static void main(String[] args) {
		SparkConf sparkConf = new SparkConf().setAppName("sample").setMaster("local");
		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
		List<String> list = Arrays.asList("kwang","wang","is","a","student");
		JavaRDD<String> num = javaSparkContext.parallelize(list);
	    //参数:false就是抽取之后,不会将数据放回,最终的结果是不会有重复的,如果是true会将数据放回再次抽取,最终结果就会有重复的。
		//它的第三个参数,种子,如果不指定就会自动产生一个种子,所以每次的结果都是不一样的,但是如果指定一个种子
		//那么它的最终的结果都是一样的。此参数一般作为测试使用。
		num.sample(false, 0.8).foreach(new VoidFunction<String>() {
			
			@Override
			public void call(String arg0) throws Exception {
				// TODO Auto-generated method stub
				System.out.println(arg0);
			}
		});
	}

}

12.Take

package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

/*
 * take是取数据,参数是几,我们就取其中的几个参数出来
 */
public class Take {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("take").setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        List<String> list = Arrays.asList("kang","wang","lala");
        JavaRDD<String> result = javaSparkContext.parallelize(list);
        /*
         *         List<String> list1 = Arrays.asList("kang","wang","lala");
        List<Integer> list = Arrays.asList(1,2,3,4,5);
        JavaRDD<Integer> result = javaSparkContext.parallelize(list);
        JavaRDD<String> result1 = javaSparkContext.parallelize(list1);
        result.take(2);
        result1.take(num);
        
         */
        List<String> name = result.take(2);
        for(String value :name)
        {
            System.out.println(value);
        }
    }

}

13.TakeSample

package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

/*
 * take是取出数据,sample是随机的取出数据
 * 
 * takeSample先进行sample在进行采样
 */
public class TakeSample {
	public static void main(String[] args) {
		SparkConf sparkConf = new SparkConf().setAppName("takesample").setMaster("local");
		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
		List<String> list = Arrays.asList("kang1","wang1","lala1","kang2","wang2","lala2","kang3","wang3","lala3");
		JavaRDD<String> result = javaSparkContext.parallelize(list);
		//第一个参数是否放回,第二个参数是数据的个数,第三个参数是种子。  如果种子一样,每一次的数据都是一样的,如果种子不一样,每一次的参数都不一样。
		List<String> value = result.takeSample(false, 3);
		for(String v:value)
		{
			System.out.println(v);
		}
	}
}

14.Union

package kw.test.action;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

/*
 * 将两个list变为一个list,他不会进行shuffle操作,加入开始都是两个partition,
 * 将他们union之后,会是四个
 */
public class Union {
	public static void main(String[] args) {
		

		// TODO Auto-generated method stub
		SparkConf sparkConf = new SparkConf().setAppName("union").setMaster("local");
		JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
		List<String> list1 = Arrays.asList("kang1","wang1","lala1","kang2","wang2","lala2");
		List<String> list2 = Arrays.asList("kang3","wang3","lala3");
		JavaRDD<String> result1 = javaSparkContext.parallelize(list1,2);
		JavaRDD<String> result2 = javaSparkContext.parallelize(list2,2);
		JavaRDD <String> unionre = result1.union(result2);
		unionre.foreach(new VoidFunction<String>() {
			
			@Override
			public void call(String arg0) throws Exception {
				// TODO Auto-generated method stub
				System.out.println(arg0);
			}
		});
	}
}

  

原文地址:https://www.cnblogs.com/kw28188151/p/8722364.html