The test code is as follows:
package cn.xpleaf.bigdata.spark.scala.sql.p1

import java.util.Properties

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SaveMode}

/**
  * Various practical operations of SparkSQL on data loading and data landing
  */
object _03SparkSQLLoadAndSaveOps {
    def main(args: Array[String]): Unit = {
        Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
        val conf = new SparkConf().setMaster("local[2]").setAppName(_01SparkSQLOps.getClass.getSimpleName)
        val sc = new SparkContext(conf)
        val sqlContext = new SQLContext(sc)

//        readOps(sqlContext)
        writeOps(sqlContext)
        sc.stop()
    }

    /**
      * Watch out for the exception thrown when the write target directory already exists:
      *     org.apache.spark.sql.AnalysisException: path file:/D:/data/spark/sql/people-1.json already exists
      * If you want to reuse the directory, you need to set a specific save mode (SaveMode):
      *     ErrorIfExists
      *         Default: if the directory exists, throw an exception
      *     Append
      *         Append to the existing data
      *     Ignore
      *         Ignore: equivalent to not executing the write at all
      *     Overwrite
      *         Overwrite the existing data
      */
    def writeOps(sqlContext: SQLContext): Unit = {
        val df = sqlContext.read.json("D:/data/spark/sql/people.json")
        df.registerTempTable("people")
        val retDF = sqlContext.sql("select * from people where age > 20")
//        retDF.show()
        // Land the results on the file system
//        retDF.coalesce(1).write.mode(SaveMode.Overwrite).json("D:/data/spark/sql/people-1.json")
        // Land the results in the database
        val url = "jdbc:mysql://localhost:3306/test"
        val table = "people1"   // A new table will be created
        val properties = new Properties()
        properties.put("user", "root")
        properties.put("password", "root")
        retDF.coalesce(1).write.jdbc(url, table, properties)
    }

    /*
        SparkSQL reads data
            java.lang.RuntimeException: file:/D:/data/spark/sql/people.json is not a Parquet file
        The default file format loaded by read.load is parquet (parquet.apache.org).
        To load other file formats, you need to specify the format explicitly, e.g. format("json").
     */
    def readOps(sqlContext: SQLContext): Unit = {
//        val df = sqlContext.read.load("D:/data/spark/sql/users.parquet")
//        val df = sqlContext.read.format("json").load("D:/data/spark/sql/people.json")
//        val df = sqlContext.read.json("D:/data/spark/sql/people.json")
        val url = "jdbc:mysql://localhost:3306/test"
        val table = "people"
        val properties = new Properties()
        properties.put("user", "root")
        properties.put("password", "root")
        val df = sqlContext.read.jdbc(url, table, properties)
        df.show()
    }
}
When the read operation is performed, the output results are as follows:
+---+----------------+---+------+
| id|            name|age|height|
+---+----------------+---+------+
|  1|    Little Sweet| 18| 168.0|
|  2|  Little Dandane| 19| 167.0|
|  3|             God| 25| 181.0|
|  4|Head of Regiment| 38| 158.0|
|  5|      Journalist| 22| 169.0|
+---+----------------+---+------+
When a write operation is performed:
1. If saved to a JSON file, note that there are several write modes; also, the result is saved as a directory in HDFS-compatible format (a directory of part files), not a single file.
2. If saved via JDBC, a table containing the DataFrame's columns is created in the database; note that the table must not already exist.

A sketch of the different save modes follows below.
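To make the save modes concrete, here is a minimal sketch of the DataFrameWriter calls in the Spark 1.x API used in this article. The path, MySQL URL and people1 table are just the illustrative values from the code above; in practice you would pick one mode rather than calling all of them.

import java.util.Properties
import org.apache.spark.sql.{DataFrame, SaveMode}

// retDF stands for the query result DataFrame produced in writeOps above
def saveWithModes(retDF: DataFrame): Unit = {
    // ErrorIfExists (the default): throw an exception if the target directory already exists
    retDF.write.mode(SaveMode.ErrorIfExists).json("D:/data/spark/sql/people-1.json")
    // Overwrite: replace whatever is already in the target directory
    retDF.write.mode(SaveMode.Overwrite).json("D:/data/spark/sql/people-1.json")
    // Append: add new part files next to the existing ones
    retDF.write.mode(SaveMode.Append).json("D:/data/spark/sql/people-1.json")
    // Ignore: silently skip the write if the target already exists
    retDF.write.mode(SaveMode.Ignore).json("D:/data/spark/sql/people-1.json")

    // For JDBC, the default mode creates the table and fails if it already exists;
    // a non-default mode such as Append lets you write into an existing table
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "root")
    retDF.write.mode(SaveMode.Append).jdbc("jdbc:mysql://localhost:3306/test", "people1", properties)
}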
Integration of Spark SQL and Hive
You need to start Hive first, and then do the following.
Code writing
The test code is as follows:
package cn.xpleaf.bigdata.spark.scala.sql.p2

import cn.xpleaf.bigdata.spark.scala.sql.p1._01SparkSQLOps
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

/**
  * Manipulate the data of tables in Hive by creating a HiveContext
  * Data sources:
  * teacher_info.txt
  *     name(String)    height(double)
  *     zhangsan,175
  *     lisi,180
  *     wangwu,175
  *     zhaoliu,195
  *     zhouqi,165
  *     weiba,185
  *
  *     create table teacher_info(
  *     name string,
  *     height double
  *     ) row format delimited
  *     fields terminated by ',';
  *
  * teacher_basic.txt
  *     name(String)    age(int)    married(boolean)    children(int)
  *     zhangsan,23,false,0
  *     lisi,24,false,0
  *     wangwu,25,false,0
  *     zhaoliu,26,true,1
  *     zhouqi,27,true,2
  *     weiba,28,true,3
  *
  *     create table teacher_basic(
  *     name string,
  *     age int,
  *     married boolean,
  *     children int
  *     ) row format delimited
  *     fields terminated by ',';
  *
  * Requirements:
  * 1. Create the corresponding tables in Hive through SparkSQL and load the data into them
  * 2. Execute a Spark SQL job that joins teacher_info and teacher_basic and stores the result in a table named teacher
  *
  * When performing Hive operations in a cluster, the following configuration is required:
  *     1. Copy hive-site.xml to the spark/conf directory and the mysql connector to the spark/lib directory
  *     2. Add a line to $SPARK_HOME/conf/spark-env.sh:
  *        export SPARK_CLASSPATH=$SPARK_CLASSPATH:$SPARK_HOME/lib/mysql-connector-java-5.1.39.jar
  */
object _01HiveContextOps {
    def main(args: Array[String]): Unit = {
        Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
        val conf = new SparkConf()
//            .setMaster("local[2]")
            .setAppName(_01SparkSQLOps.getClass.getSimpleName)
        val sc = new SparkContext(conf)
        val hiveContext = new HiveContext(sc)

        // Create the teacher_info table
        hiveContext.sql("CREATE TABLE teacher_info(" +
            "name string, " +
            "height double) " +
            "ROW FORMAT DELIMITED " +
            "FIELDS TERMINATED BY ','")

        // Create the teacher_basic table
        hiveContext.sql("CREATE TABLE teacher_basic(" +
            "name string, " +
            "age int, " +
            "married boolean, " +
            "children int) " +
            "ROW FORMAT DELIMITED " +
            "FIELDS TERMINATED BY ','")

        // Load data into the tables
        hiveContext.sql("LOAD DATA LOCAL INPATH '/home/uplooking/data/hive/sql/teacher_info.txt' INTO TABLE teacher_info")
        hiveContext.sql("LOAD DATA LOCAL INPATH '/home/uplooking/data/hive/sql/teacher_basic.txt' INTO TABLE teacher_basic")

        // Second step: compute the joined data of the two tables
        val joinDF = hiveContext.sql("SELECT " +
            "b.name, " +
            "b.age, " +
            "if(b.married, 'married', 'unmarried') as married, " +
            "b.children, " +
            "i.height " +
            "FROM teacher_info i " +
            "INNER JOIN teacher_basic b ON i.name = b.name")

        joinDF.collect().foreach(println)

        // Save the join result into the teacher table
        joinDF.write.saveAsTable("teacher")

        sc.stop()
    }
}
Packaging, uploading and configuration
After packaging, upload to the cluster environment, and then configure Spark as follows:
When performing Hive operations in a cluster, the following configuration is required:
1. Copy hive-site.xml to the spark/conf directory and the mysql connector jar to the spark/lib directory.
2. Add a line to $SPARK_HOME/conf/spark-env.sh: export SPARK_CLASSPATH=$SPARK_CLASSPATH:$SPARK_HOME/lib/mysql-connector-java-5.1.39.jar
Submit spark jobs
The scripts used to submit jobs in spark are as follows:
[uplooking@uplooking01 spark]$ cat spark-submit-standalone.sh
#export HADOOP_CONF_DIR=/home/uplooking/app/hadoop/etc/hadoop
/home/uplooking/app/spark/bin/spark-submit \
--class $2 \
--master spark://uplooking02:7077 \
--executor-memory 1G \
--num-executors 1 \
$1 \
Execute the following commands:
./spark-submit-standalone.sh spark-hive.jar cn.xpleaf.bigdata.spark.scala.sql.p2._01HiveContextOps
Verification
We can see the expected output in the job's execution log, or we can verify directly in Hive:
hive> show tables;
OK
hpeople
people
t1
teacher
teacher_basic
teacher_info
Time taken: 0.03 seconds, Fetched: 6 row(s)
hive> select * from teacher;
OK
zhangsan        23      unmarried       0       175.0
lisi            24      unmarried       0       180.0
wangwu          25      unmarried       0       175.0
zhaoliu         26      married         1       195.0
zhouqi          27      married         2       165.0
weiba           28      married         3       185.0
Time taken: 0.369 seconds, Fetched: 6 row(s)
Integration of Spark and ES
You need to make sure that the Elasticsearch environment is set up first.
The test code is as follows:
package cn.xpleaf.bigdata.spark.scala.sql.p2

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark.sql._
import org.elasticsearch.spark._

/**
  * Integrated operation of Spark and ES
  * Requires the maven dependency for Spark and ES:
  *     elasticsearch-hadoop
  *     2.3.0
  * Loads account.json into spark/account, an index/type in ES
  * Reference: the official documentation at https://www.elastic.co/guide/en/elasticsearch/hadoop/2.3/spark.html
  */
object _02SparkElasticSearchOps {
    def main(args: Array[String]): Unit = {
        Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
        val conf = new SparkConf()
            .setAppName(_02SparkElasticSearchOps.getClass().getSimpleName)
            .setMaster("local[2]")
        /**
          * Configuration for integrating Spark with ES
          */
        conf.set("es.index.auto.create", "true")
        conf.set("es.nodes", "uplooking01")
        conf.set("es.port", "9200")
        val sc = new SparkContext(conf)
        val sqlContext = new SQLContext(sc)

//        write2ES(sqlContext)
        readFromES(sc)

        sc.stop()
    }

    /**
      * Read data from ES
      * (operates through the SparkContext)
      */
    def readFromES(sc: SparkContext): Unit = {
        val resources = "spark/account"  // index/type
        val jsonRDD = sc.esJsonRDD(resources)
        jsonRDD.foreach(println)
    }

    /**
      * Write data to ES
      * (operates through the SQLContext)
      */
    def write2ES(sqlContext: SQLContext): Unit = {
        val jsonDF = sqlContext.read.json("D:/data/spark/sql/account.json")
        val resources = "spark/account"  // index/type
        jsonDF.saveToEs(resources)
    }
}
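Besides esJsonRDD, the elasticsearch-hadoop connector also exposes a DataFrame reader, so the same index can be queried through the SQLContext. This is a minimal sketch based on the elasticsearch-hadoop 2.3 documentation (the format name is the connector's data source identifier):

import org.apache.spark.sql.SQLContext

def readFromESAsDF(sqlContext: SQLContext): Unit = {
    // Load the spark/account index/type as a DataFrame through the ES data source
    val df = sqlContext.read
        .format("org.elasticsearch.spark.sql")
        .load("spark/account")
    df.printSchema()
    df.show()
}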
Spark SQL function
Overview (Spark 1.5.X ~ 1.6.X built-in functions)
Using built-in functions in Spark SQL to analyze data is different from using plain SQL text: a built-in function applied to a DataFrame returns a Column object. Since a DataFrame is, by definition, "a distributed collection of data organized into named columns", this provides a solid foundation for complex data analysis and great convenience: while operating on a DataFrame, we can call built-in functions at any time to meet business needs, which greatly reduces the unnecessary time spent building auxiliary business logic (based on the mapping of the actual model) and lets us focus on the data analysis itself. This is very valuable for improving engineers' productivity. Starting with Spark 1.5.x, a large number of built-in functions are provided, such as max, mean, min, sum, avg, explode, size, sort_array, day, to_date, abs, acos, asin, atan, and so on.
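As a short illustration of the point above, the aggregation that is written in SQL later in this article can also be expressed directly with the functions in org.apache.spark.sql.functions, each of which returns a Column. This is a sketch assuming a SQLContext like the ones created above and the same people.json data used elsewhere in this article:

import org.apache.spark.sql.functions.{avg, col, count, max, min}

val pdf = sqlContext.read.json("D:/data/spark/sql/people.json")
pdf.groupBy("age")
    .agg(max("age").as("max_age"),    // each function call returns a Column
         min("age").as("min_age"),
         avg("age").as("avg_age"),
         count("age").as("count"))
    .orderBy(col("age").desc)
    .show()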
Overall, the built-in functions fall into the following basic categories:
1. Aggregate functions, such as countDistinct, sumDistinct, etc.
2. Collection functions, such as sort_array, explode, etc.
3. Date and time functions, such as hour, quarter, next_day
4. Mathematical functions, such as asin, atan, sqrt, tan, round, etc.
5. Window functions, such as rowNumber, etc.
6. String functions, such as concat, format_number, regexp_extract
7. Other functions, such as isNaN, sha, randn, callUDF

The following are concepts from Hive, but obviously Spark SQL has the same concepts:

UDF (User Defined Function): one input, one output
    e.g. a ---> A, strlen("adbad") = 5
UDAF (User Defined Aggregation Function): multiple inputs, one output
    e.g. sum(a, b, c, d) ---> aggregated result
UDTF (User Defined Table Function): one input, multiple outputs
    e.g. "hello you", "hello me" ---> split(" ") ---> Array["hello", "you", ...] ---> one row per element ("hello", "you", ...), i.e. row-column conversion

A UDF case appears later in this article; a sketch of a custom UDAF follows right after this list.
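For UDAF, Spark 1.5/1.6 provides the UserDefinedAggregateFunction API. The following is a minimal sketch of a custom sum-like aggregation; the class name MySum and the registration name mySum are illustrative, not from the original article:

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

class MySum extends UserDefinedAggregateFunction {
    // Type of the input column(s)
    override def inputSchema: StructType = StructType(StructField("value", DoubleType) :: Nil)
    // Type of the intermediate aggregation buffer
    override def bufferSchema: StructType = StructType(StructField("sum", DoubleType) :: Nil)
    // Type of the final result
    override def dataType: DataType = DoubleType
    override def deterministic: Boolean = true
    // Initialize the buffer for a new group
    override def initialize(buffer: MutableAggregationBuffer): Unit = buffer(0) = 0.0
    // Fold one input row into the buffer
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit =
        buffer(0) = buffer.getDouble(0) + input.getDouble(0)
    // Merge two partial buffers (e.g. from different partitions)
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit =
        buffer1(0) = buffer1.getDouble(0) + buffer2.getDouble(0)
    // Produce the final result for a group
    override def evaluate(buffer: Row): Any = buffer.getDouble(0)
}

// Register it and use it like a built-in aggregate (table name is hypothetical):
// sqlContext.udf.register("mySum", new MySum)
// sqlContext.sql("select mySum(height) from people").show()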
A basic case is as follows:
package cn.xpleaf.bigdata.spark.scala.sql.p2

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

/**
  * SparkSQL built-in function operations
  */
object _03SparkSQLFunctionOps {
    def main(args: Array[String]): Unit = {
        Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
        val conf = new SparkConf()
            .setAppName(_03SparkSQLFunctionOps.getClass().getSimpleName)
            .setMaster("local[2]")
        val sc = new SparkContext(conf)
        val sqlContext = new SQLContext(sc)

        val pdf = sqlContext.read.json("D:/data/spark/sql/people.json")
        pdf.show()

        pdf.registerTempTable("people")

        // Count the total number of people
        sqlContext.sql("select count(1) from people").show()

        // Aggregate statistics per age (max, min, avg, count)
        sqlContext.sql("select age, " +
            "max(age) as max_age, " +
            "min(age) as min_age, " +
            "avg(age) as avg_age, " +
            "count(age) as count " +
            "from people group by age order by age desc").show()

        sc.stop()
    }
}
The output results are as follows:
+---+------+----------+
|age|height|      name|
+---+------+----------+
| 10| 168.8|   Michael|
| 30| 168.8|      Andy|
| 19| 169.8|    Justin|
| 32| 188.8|      Jack|
| 10| 158.8|      John|
| 19| 179.8|      Domu|
| 13| 179.8|Yuan Shuai|
| 30| 175.8|   Yin Jie|
| 19| 179.9|   Sun Rui|
+---+------+----------+

18/05/09 17:53:23 INFO FileInputFormat: Total input paths to process : 1
+---+
|_c0|
+---+
|  9|
+---+

18/05/09 17:53:24 INFO FileInputFormat: Total input paths to process : 1
+---+-------+-------+-------+-----+
|age|max_age|min_age|avg_age|count|
+---+-------+-------+-------+-----+
| 32|     32|     32|   32.0|    1|
| 30|     30|     30|   30.0|    2|
| 19|     19|     19|   19.0|    3|
| 13|     13|     13|   13.0|    1|
| 10|     10|     10|   10.0|    2|
+---+-------+-------+-------+-----+
Spark SQL Window Functions
1. Since Spark 1.5.x, window functions have been introduced into Spark SQL and DataFrame, for example row_number(), which enables us to implement grouped top-N logic.
2. As a case, we can compute top-N values using Spark's window functions. You may recall that we computed top-N much earlier in this course, which was quite troublesome at the time; with Spark SQL it is now very convenient, as the sketch below shows.
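A minimal sketch of grouped top-N with a window function, assuming a HiveContext (Spark 1.5/1.6 needs Hive support for window functions) and a hypothetical scoreDF DataFrame with columns subject, name and score:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, rowNumber}

// Rank rows within each subject by score, highest first
val w = Window.partitionBy("subject").orderBy(col("score").desc)

val top3 = scoreDF
    .withColumn("rank", rowNumber().over(w))  // 1, 2, 3, ... within each subject
    .where(col("rank") <= 3)                  // keep only the top 3 per subject
top3.show()

// The same logic in SQL (the table name scores is hypothetical):
// hiveContext.sql(
//     "select name, subject, score from (" +
//     "  select name, subject, score, " +
//     "         row_number() over (partition by subject order by score desc) as rank " +
//     "  from scores) t " +
//     "where t.rank <= 3").show()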
UDF Operation of Spark SQL
The test code is as follows:
package cn.xpleaf.bigdata.spark.scala.sql.p2

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
  * SparkSQL user-defined function (UDF) operations
  */
object _04SparkSQLFunctionOps {
    def main(args: Array[String]): Unit = {
        Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
        val conf = new SparkConf()
            .setAppName(_04SparkSQLFunctionOps.getClass().getSimpleName)
            .setMaster("local[2]")
        val sc = new SparkContext(conf)
        val sqlContext = new SQLContext(sc)

        /**
          * User-defined function (UDF) operation in Spark SQL (analogous to Hive, since both Hive and Spark SQL are interactive SQL engines)
          * 1. Create an ordinary function
          * 2. Register it (in the SQLContext)
          * 3. Use it directly
          *
          * Case: create a UDF that returns the length of a string
          */
        // 1. Create an ordinary function
        def strLen(str: String): Int = str.length

        // 2. Register it (in the SQLContext)
        sqlContext.udf.register[Int, String]("myStrLen", strLen)

        val list = List("Hello you", "Hello he", "Hello me")

        // Convert the RDD to a DataFrame
        val rowRDD = sqlContext.sparkContext.parallelize(list).flatMap(_.split(" ")).map(word => {
            Row(word)
        })
        val scheme = StructType(List(
            StructField("word", DataTypes.StringType, false)
        ))
        val df = sqlContext.createDataFrame(rowRDD, scheme)

        df.registerTempTable("test")

        // 3. Use it directly
        sqlContext.sql("select word, myStrLen(word) from test").show()

        sc.stop()
    }
}
The output results are as follows:
+-----+---+
| word|_c1|
+-----+---+
|Hello|  5|
|  you|  3|
|Hello|  5|
|   he|  2|
|Hello|  5|
|   me|  2|
+-----+---+
WordCount Operation of Spark SQL
The test code is as follows:
package cn.xpleaf.bigdata.spark.scala.sql.p2

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}

/**
  * Two important parts here:
  * 1. Use SparkSQL to complete word count statistics
  * 2. Use of window functions
  */
object _05SparkSQLFunctionOps2 {
    def main(args: Array[String]): Unit = {
        Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
        val conf = new SparkConf()
            .setAppName(_05SparkSQLFunctionOps2.getClass().getSimpleName)
            .setMaster("local[2]")
        val sc = new SparkContext(conf)
        val sqlContext = new SQLContext(sc)

        val list = List("Hello you", "Hello he", "Hello me")

        // Convert the RDD to a DataFrame
        val rowRDD = sqlContext.sparkContext.parallelize(list).map(line => {
            Row(line)
        })
        val scheme = StructType(List(
            StructField("line", DataTypes.StringType, false)
        ))
        val df = sqlContext.createDataFrame(rowRDD, scheme)

        df.registerTempTable("test")
        df.show()

        // Execute wordcount
        val sql = "select t.word, count(1) as count " +
            "from " +
            "(select " +
            "explode(split(line, ' ')) as word " +
            "from test) as t " +
            "group by t.word order by count desc"
        sqlContext.sql(sql).show()

        sc.stop()
    }
}
The output results are as follows:
+---------+
|     line|
+---------+
|Hello you|
| Hello he|
| Hello me|
+---------+

+-----+-----+
| word|count|
+-----+-----+
|Hello|    3|
|   me|    1|
|   he|    1|
|  you|    1|
+-----+-----+