add HousePriceProblem and HousePriceSolution

James Lee · James Lee · commit 716cde386c1a · 2017-02-12T17:52:01.000Z
diff --git a/in/RealEstate.csv b/in/RealEstate.csv
@@ -1,4 +1,4 @@
-MLS,Location,Price,Bedrooms,Bathrooms,Size,Price/SQ.Ft,Status
+MLS,Location,Price,Bedrooms,Bathrooms,Size,Price SQ Ft,Status
 132842,Arroyo Grande,795000.00,3,3,2371,335.30,Short Sale
 134364,Paso Robles,399000.00,4,3,2818,141.59,Short Sale
 135141,Paso Robles,545000.00,4,3,3032,179.75,Short Sale
diff --git a/src/main/java/com/sparkTutorial/sparkSql/HousePriceProblem.java b/src/main/java/com/sparkTutorial/sparkSql/HousePriceProblem.java
@@ -0,0 +1,41 @@
+package com.sparkTutorial.sparkSql;
+
+
+public class HousePriceProblem {
+
+        /* TODO: Create a Spark program to read the house data from in/RealEstate.csv, group by location, aggregate the average price per SQ Ft and max price, and sort by average price per SQ Ft in descending order.
+
+        The HOUSES dataset contains a collection of recent real estate listings in San Luis Obispo county and
+        around it. The dataset is provided in two formats: as a CSV file and as a Microsoft Excel (1997-2003)
+        spreadsheet.
+
+        The dataset contains the following fields:
+        1. MLS: Multiple listing service number for the house (unique ID).
+        2. Location: city/town where the house is located. Most locations are in San Luis Obispo county and
+        northern Santa Barbara county (Santa Maria-Orcutt, Lompoc, Guadalupe, Los Alamos), but there
+        are some out-of-area locations as well.
+        3. Price: the most recent listing price of the house (in dollars).
+        4. Bedrooms: number of bedrooms.
+        5. Bathrooms: number of bathrooms.
+        6. Size: size of the house in square feet.
+        7. Price SQ Ft: price of the house per square foot.
+        8. Status: type of sale. Three types are represented in the dataset: Short Sale, Foreclosure and Regular.
+
+        Each field is comma separated.
+
+        Sample output:
+
+        +----------------+-----------------+----------+
+        |        Location| avg(Price SQ Ft)|max(Price)|
+        +----------------+-----------------+----------+
+        |          Oceano|           1145.0|   1195000|
+        |         Bradley|            606.0|   1600000|
+        | San Luis Obispo|            459.0|   2369000|
+        |      Santa Ynez|            391.4|   1395000|
+        |         Cayucos|            387.0|   1500000|
+        |.............................................|
+        |.............................................|
+        |.............................................|
+
+         */
+}
diff --git a/src/main/java/com/sparkTutorial/sparkSql/HousePriceSolution.java b/src/main/java/com/sparkTutorial/sparkSql/HousePriceSolution.java
@@ -0,0 +1,33 @@
+package com.sparkTutorial.sparkSql;
+
+
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.apache.spark.sql.Column;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+
+import static org.apache.spark.sql.functions.avg;
+import static org.apache.spark.sql.functions.max;
+
+/**
+ * Solution to the house price exercise: reads in/RealEstate.csv, groups the listings by
+ * "Location", and prints the average "Price SQ Ft" and the maximum "Price" per location,
+ * sorted by that average in descending order.
+ */
+public class HousePriceSolution {
+
+    private static final String PRICE = "Price";
+    private static final String PRICE_SQ_FT = "Price SQ Ft";
+    // Name of the averaged column; used both as the output header and as the sort key.
+    private static final String AVG_PRICE_SQ_FT = "avg(" + PRICE_SQ_FT + ")";
+
+    /**
+     * Builds a Spark session with master "local[1]", runs the aggregation and prints the result.
+     *
+     * @param args command-line arguments (unused)
+     * @throws Exception kept in the signature for compatibility with the original declaration
+     */
+    public static void main(String[] args) throws Exception {
+
+        // Only let ERROR-level messages through for loggers under "org" to keep the output readable.
+        Logger.getLogger("org").setLevel(Level.ERROR);
+        SparkSession session = SparkSession.builder().appName("HousePriceSolution").master("local[1]").getOrCreate();
+
+        try {
+            // The first CSV line is treated as the header. No schema is supplied or inferred,
+            // so the numeric columns are cast explicitly below.
+            Dataset<Row> realEstate = session.read().option("header", "true").csv("in/RealEstate.csv");
+
+            // NOTE(review): the cast to "long" presumably drops the fractional part of values such
+            // as 335.30 before averaging; kept as-is so the output matches the exercise's sample.
+            Dataset<Row> castedRealEstate = realEstate
+                    .withColumn(PRICE, new Column(PRICE).cast("long"))
+                    .withColumn(PRICE_SQ_FT, new Column(PRICE_SQ_FT).cast("long"));
+
+            // Alias the average explicitly so the sort key does not depend on Spark's
+            // auto-generated aggregate column name.
+            castedRealEstate.groupBy("Location")
+                    .agg(avg(PRICE_SQ_FT).as(AVG_PRICE_SQ_FT), max(PRICE))
+                    .orderBy(new Column(AVG_PRICE_SQ_FT).desc())
+                    .show();
+        } finally {
+            // Always release the session, even if reading or aggregating fails.
+            session.stop();
+        }
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-MLS,Location,Price,Bedrooms,Bathrooms,Size,Price/SQ.Ft,Status`
	`1`	`+MLS,Location,Price,Bedrooms,Bathrooms,Size,Price SQ Ft,Status`
`2`	`2`	`132842,Arroyo Grande,795000.00,3,3,2371,335.30,Short Sale`
`3`	`3`	`134364,Paso Robles,399000.00,4,3,2818,141.59,Short Sale`
`4`	`4`	`135141,Paso Robles,545000.00,4,3,3032,179.75,Short Sale`