22
33import org .apache .log4j .Level ;
44import org .apache .log4j .Logger ;
5- import org .apache .spark .sql .Column ;
65import org .apache .spark .sql .Dataset ;
76import org .apache .spark .sql .Row ;
87import org .apache .spark .sql .SparkSession ;
98
109import static org .apache .spark .sql .functions .avg ;
10+ import static org .apache .spark .sql .functions .col ;
1111import static org .apache .spark .sql .functions .max ;
1212
1313public class StackOverFlowSurvey {
@@ -33,33 +33,33 @@ public static void main(String[] args) throws Exception {
3333 responses .show (20 );
3434
3535 System .out .println ("=== Print the so_region and self_identification columns of gender table ===" );
36- responses .select (new Column ("so_region" ), new Column ("self_identification" )).show ();
36+ responses .select (col ("so_region" ), col ("self_identification" )).show ();
3737
3838 System .out .println ("=== Print records where the response is from Afghanistan ===" );
39- responses .filter (new Column ("country" ).equalTo ("Afghanistan" )).show ();
39+ responses .filter (col ("country" ).equalTo ("Afghanistan" )).show ();
4040
4141 System .out .println ("=== Print the count of occupations ===" );
42- responses .groupBy (new Column ("occupation" )).count ().show ();
42+ responses .groupBy (col ("occupation" )).count ().show ();
4343
4444
4545 System .out .println ("=== Cast the salary mid point and age mid point to integer ===" );
46- Dataset <Row > castedResponse = responses .withColumn (SALARY_MIDPOINT , new Column (SALARY_MIDPOINT ).cast ("integer" ))
47- .withColumn (AGE_MIDPOINT , new Column (AGE_MIDPOINT ).cast ("integer" ));
46+ Dataset <Row > castedResponse = responses .withColumn (SALARY_MIDPOINT , col (SALARY_MIDPOINT ).cast ("integer" ))
47+ .withColumn (AGE_MIDPOINT , col (AGE_MIDPOINT ).cast ("integer" ));
4848
4949 System .out .println ("=== Print out casted schema ===" );
5050 castedResponse .printSchema ();
5151
5252 System .out .println ("=== Print records with average mid age less than 20 ===" );
53- castedResponse .filter (new Column (AGE_MIDPOINT ).$less (20 )).show ();
53+ castedResponse .filter (col (AGE_MIDPOINT ).$less (20 )).show ();
5454
5555 System .out .println ("=== Print the result with salary middle point in descending order ===" );
56- castedResponse .orderBy (new Column (SALARY_MIDPOINT ).desc ()).show ();
56+ castedResponse .orderBy (col (SALARY_MIDPOINT ).desc ()).show ();
5757
5858 System .out .println ("=== Group by country and aggregate by average salary middle point and max age middle point ===" );
5959 castedResponse .groupBy ("country" ).agg (avg (SALARY_MIDPOINT ), max (AGE_MIDPOINT )).show ();
6060
6161 System .out .println ("=== Group by salary bucket ===" );
62- Dataset <Row > responseWithSalaryBucket = castedResponse .withColumn (SALARY_MIDPOINT_BUCKET , new Column (SALARY_MIDPOINT ).divide (20000 ).cast ("integer" ).multiply (20000 ));
63- responseWithSalaryBucket .groupBy (SALARY_MIDPOINT_BUCKET ).count ().orderBy (new Column (SALARY_MIDPOINT_BUCKET )).show ();
62+ Dataset <Row > responseWithSalaryBucket = castedResponse .withColumn (SALARY_MIDPOINT_BUCKET , col (SALARY_MIDPOINT ).divide (20000 ).cast ("integer" ).multiply (20000 ));
63+ responseWithSalaryBucket .groupBy (SALARY_MIDPOINT_BUCKET ).count ().orderBy (col (SALARY_MIDPOINT_BUCKET )).show ();
6464 }
6565}
0 commit comments