A standard deviation shows how much variation exists in the data from the average.
Problem
Given a list of employee salary and the department ,determine the standard deviation and mean of salary of each department.
Here is a sample input data attached employee_info.csv
Input Data sample
First Name,Last Name,Job Titles,Department,Full or Part-Time,Salary or Hourly,Typical Hours,Annual Salary,Hourly Rate
dubert,tomasz ,paramedic i/c,fire,f,salary,,91080.00, edwards,tim p,lieutenant,fire,f,salary,,114846.00, elkins,eric j,sergeant,police,f,salary,,104628.00, estrada,luis f,police officer,police,f,salary,,96060.00, ewing,marie a,clerk iii,police,f,salary,,53076.00, finn,sean p,firefighter,fire,f,salary,,87006.00, fitch,jordan m,law clerk,law,f,hourly,35,,14.51
Spark code for finding standard deviation and mean on pairRDD
import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.util.StatCounter; import scala.Tuple2; import scala.Tuple3; public class StandardDeviation { public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("pattern").setMaster("local"); JavaSparkContext jsc = new JavaSparkContext(conf); JavaRDD<String> rdd = jsc.textFile("C:\\codebase\\scala-project\\inputdata\\employee"); JavaPairRDD<String, Double> pair = rdd.mapToPair(new PairFunction<String, String, Double>() { @Override public Tuple2<String, Double> call(String value) throws Exception { // TODO Auto-generated method stub String data = value.toString(); String[] field = data.split(",", -1); double salary = 0; if (null != field && field.length == 9 && field[7].length() > 0) { return new Tuple2<String, Double>(field[3], Double.parseDouble(field[7])); } return new Tuple2<String, Double>("Invalid", 0.0); } }); JavaPairRDD<String, StatCounter> output = pair.aggregateByKey(new StatCounter(), StatCounter::merge, StatCounter::merge); JavaRDD<Tuple3<String, Double, Double>> statistics = output .map(new Function<Tuple2<String, StatCounter>, Tuple3<String, Double, Double>>() { @Override public Tuple3<String, Double, Double> call(Tuple2<String, StatCounter> stats) throws Exception { // TODO Auto-generated method stub return new Tuple3<String, Double, Double>(stats._1(), stats._2().stdev(), stats._2().mean()); } }); for (Tuple3<String, Double, Double> string : statistics.collect()) { System.out.println(string._1() + " " + string._2() + " " + string._3()); } } }
Spark code for finding standard deviation and mean using dataframe and dateset
import static org.apache.spark.sql.functions.col; import static org.apache.spark.sql.functions.mean; import static org.apache.spark.sql.functions.stddev_pop; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; public class SD { public static void main(String[] args) { SparkSession session = SparkSession.builder().appName("Test").master("local").getOrCreate(); Dataset<Row> dataset = session.read().option("inferSchema", "true").csv("Input Path").toDF("fn", "ln", "designation", "department", "emp_type", "full_hour", "NA", "salary", "NA2"); dataset.groupBy(col("department")).agg(stddev_pop("salary"),mean("salary")).show(100); } }
output
+--------------------+-------------------+------------------+ | department| stddev_pop(salary)| avg(salary)| +--------------------+-------------------+------------------+ | HUMAN RELATIONS|1.827656706625529E7| 4664366.823529412| | BUSINESS AFFAIRS| 17901.113404461048| 80446.425| | CULTURAL AFFAIRS| 19475.91814362522| 87048.90909090909| | CITY COUNCIL| 28000.689213568432| 63577.17206896553| | LICENSE APPL COMM| 0.0| 80568.0| | BOARD OF ELECTION| 25951.27149887194|56051.142857142855| | BUDGET & MGMT| 24634.589482742234| 93925.3953488372| | IPRA| 19972.289719324952| 94429.28571428571| | GENERAL SERVICES| 22016.97480310627| 83095.5283902439| | WATER MGMNT| 22585.087375362513| 89894.1118032787| | BUILDINGS| 16818.445717336188| 98864.83353383462| | BOARD OF ETHICS| 20455.039226312914| 94552.5| | STREETS & SAN| 20351.953600368674| 84347.77570093458| | AVIATION| 23050.82432015936| 76140.01877697841| | POLICE| 16611.270671190465| 87836.02534889111| | FAMILY & SUPPORT| 17429.109374489333| 79013.58878504673| | ANIMAL CONTRL| 19315.766009450825| 66089.68421052632| | TREASURER| 22907.562092231725| 88062.65217391304| | ADMIN HEARNG| 21817.966626935515| 78912.94736842105| | HEALTH| 21213.118163927607| 85488.2109375| | LAW| 25666.418273342595| 84582.81440443214| | DISABILITIES| 21083.999906607645| 82431.72413793103| | POLICE BOARD| 19656.0| 86136.0| | PROCUREMENT| 22200.974859699276| 83278.24390243902| | CITY CLERK| 18981.993775936626| 69762.43902439025| | HUMAN RESOURCES| 24294.372080970592| 79851.76119402985| | DoIT| 16632.648172697624| 99681.02970297029| | OEMC| 19392.66507531968| 73153.77822115383| | PUBLIC LIBRARY| 19925.747703088775| 71273.28813559322| | MAYOR'S OFFICE| 42767.8111824964| 96165.5123076923| | INSPECTOR GEN| 23066.39814872611| 84030.66666666667| | COPA| 34338.260581121816| 98784.70588235294| | TRANSPORTN| 22692.692835886082| 89976.89606060603| |COMMUNITY DEVELOP...| 18267.126036122707| 88363.25714285714| | FIRE| 20235.504804337594| 97762.3486619425| | FINANCE| 24245.274276582662| 73276.36466165414| +--------------------+-------------------+------------------+