Spark MLlib Basic Statistics(统计学基础)
一、jar 依赖与 JavaSparkContext(jsc)的创建
package ML.BasicStatistics;

import com.google.common.collect.Lists;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.DoubleFlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.mllib.linalg.Matrices;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.stat.KernelDensity;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.ChiSqTestResult;
import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.rdd.RDD;
import scala.Tuple2;
import scala.runtime.Statics;
import static org.apache.spark.mllib.random.RandomRDDs.*;
import java.util.*;

/**
 * Entry point for a set of Spark MLlib basic-statistics examples.
 *
 * <p>{@code main} sets the {@code hadoop.home.dir} system property to a hard-coded
 * Windows path, then builds a {@link SparkConf} with master {@code "local"} and a
 * {@link JavaSparkContext} named {@code jsc}.
 *
 * @author DingH
 * @since 2019/4/3 16:11
 */
public class BasicStatistics {
    public static void main(String[] args) {
        // NOTE(review): machine-specific path; presumably the local Hadoop/winutils
        // install needed on Windows - adjust before running elsewhere.
        System.setProperty("hadoop.home.dir","E:\\hadoop-2.6.5");
        // Application name "BasicStatistics", master "local".
        SparkConf conf = new SparkConf().setAppName("BasicStatistics").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);
二、Summary statistics:摘要统计
/**
 * Summary statistics.
 *
 * Statistics.colStats returns a MultivariateStatisticalSummary holding the
 * column-wise max, min, mean, variance and number of non-zeros, plus the total count.
 */
// Each dense vector is one observation (row); statistics are taken per column.
List<Vector> observations = Arrays.asList(
        Vectors.dense(1, 0, 3),
        Vectors.dense(2, 0, 4),
        Vectors.dense(3, 0, 5));
JavaRDD<Vector> observationRdd = jsc.parallelize(observations);

MultivariateStatisticalSummary columnStats = Statistics.colStats(observationRdd.rdd());

// Print the per-column mean, variance and non-zero count, in that order.
System.out.println(columnStats.mean());
System.out.println(columnStats.variance());
System.out.println(columnStats.numNonzeros());
三、Correlations:相关性
/** * @Title: Correlations:相关性 */ JavaRDD<Tuple2<String, String>> parallelize = jsc.parallelize(Lists.newArrayList( new Tuple2<String, String>("cat", "11"), new Tuple2<String, String>("dog", "22"), new Tuple2<String, String>("cat", "33"), new Tuple2<String, String>("pig", "44") )); JavaDoubleRDD seriesX = parallelize.mapPartitionsToDouble(new DoubleFlatMapFunction<Iterator<Tuple2<String, String>>>() { public Iterable<Double> call(Iterator<Tuple2<String, String>> tuple2Iterator) throws Exception { ArrayList<Double> strings = new ArrayList<Double>(); while (tuple2Iterator.hasNext()){ strings.add(Double.parseDouble(tuple2Iterator.next()._2)); } return strings; } }); JavaDoubleRDD seriesY = parallelize.mapPartitionsToDouble(new DoubleFlatMapFunction<Iterator<Tuple2<String, String>>>() { public Iterable<Double> call(Iterator<Tuple2<String, String>> tuple2Iterator) throws Exception { ArrayList<Double> strings = new ArrayList<Double>(); while (tuple2Iterator.hasNext()){ strings.add(Double.parseDouble(tuple2Iterator.next()._2)+1); } return strings; } }); //compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a //method is not specified, Pearson's method will be used by default. double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); JavaRDD<Vector> parallelize11 = jsc.parallelize(Arrays.asList( Vectors.dense(1, 0, 3), Vectors.dense(2, 0, 4), Vectors.dense(3, 0, 5) ));// note that each Vector is a row and not a column Matrix correlation2 = Statistics.corr(parallelize11.rdd(), "spearman"); System.out.println(correlation2);
三、Stratified sampling:分层抽样
/** * @Title: Stratified sampling:分层抽样 */ JavaRDD<Tuple2<String, String>> parallelize = jsc.parallelize(Lists.newArrayList( new Tuple2<String, String>("cat", "11"), new Tuple2<String, String>("dog", "22"), new Tuple2<String, String>("cat", "33"), new Tuple2<String, String>("pig", "44") )); JavaPairRDD data = parallelize.mapToPair(new PairFunction<Tuple2<String, String>, String, String>() { public Tuple2<String, String> call(Tuple2<String, String> stringStringTuple2) throws Exception { return new Tuple2<String, String>(stringStringTuple2._1, stringStringTuple2._2); } }); // an RDD of any key value pairs Map<String, Double> fractions = new HashMap<String, Double>(); // specify the exact fraction desired from each key fractions.put("cat",0.5); //对于每个key取值的概率 fractions.put("dog",0.8); fractions.put("pig",0.8); // Get an exact sample from each stratum JavaPairRDD approxSample = data.sampleByKey(false, fractions); JavaPairRDD exactSample = data.sampleByKeyExact(false, fractions); approxSample.foreach(new VoidFunction() { public void call(Object o) throws Exception { System.out.println(o); } });
四、Hypothesis testing:假设检验
/** * @Title: Hypothesis testing 假设检验 */ Vector vec = Vectors.dense(1,2,3,4); // a vector composed of the frequencies of events // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, // the test runs against a uniform distribution. ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec); // summary of the test including the p-value, degrees of freedom, test statistic, the method used, // and the null hypothesis. System.out.println(goodnessOfFitTestResult); Matrix mat = Matrices.dense(3,2,new double[]{1,2,3,4,5,6}); // a contingency matrix // conduct Pearson's independence test on the input contingency matrix ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); // summary of the test including the p-value, degrees of freedom... System.out.println(independenceTestResult); JavaRDD<LabeledPoint> obs = MLUtils.loadLibSVMFile(jsc.sc(), "/data...").toJavaRDD(); // an RDD of labeled points // The contingency table is constructed from the raw (feature, label) pairs and used to conduct // the independence test. Returns an array containing the ChiSquaredTestResult for every feature // against the label. ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); int i = 1; for (ChiSqTestResult result : featureTestResults) { System.out.println("Column " + i + ":"); System.out.println(result); // summary of the test i++; } JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0,0.3)); KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data,"norm"); // summary of the test including the p-value, test statistic, // and null hypothesis // if our p-value indicates significance, we can reject the null hypothesis System.out.println(testResult);
五、Random data generation:随机数据生成
/** * @Title: Random data generation :uniform, standard normal, or Poisson. */ JavaDoubleRDD u = normalJavaRDD(jsc, 100,2); // Apply a transform to get a random double RDD following `N(1, 4)`. JavaRDD<Double> map = u.map(new Function<Double, Double>() { public Double call(Double aDouble) throws Exception { return 1.0 + 2.0 * aDouble; } }); map.foreach(new VoidFunction<Double>() { public void call(Double aDouble) throws Exception { System.out.println(aDouble); } });
六、Kernel density estimation:核密度估计
/** * @Title: Kernel density estimation */ JavaRDD<Double> data = jsc.parallelize(Arrays.asList(1.0, 2.0, 3.0));// an RDD of sample data // Construct the density estimator with the sample data and a standard deviation for the Gaussian // kernels KernelDensity kd = new KernelDensity() .setSample(data) .setBandwidth(3.0); // Find density estimates for the given values double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0}); for (int i = 0; i < densities.length; i++) { System.out.println(densities[i]); }