Spark examples

Keywords: Spark Apache SQL Java

Spark Streaming is a quasi-real-time (micro-batch) stream processing framework: it processes data in small batches, so its end-to-end latency is typically on the order of seconds. Storm, by contrast, is a true real-time stream processing framework that handles records one at a time with millisecond-level latency. Which framework to choose therefore depends on the specific business scenario. It is worth clarifying that many people believe Spark Streaming runs unstably, loses data, or has weak transactional guarantees; in most cases this is because they have not fully mastered Spark Streaming and Spark itself. As for latency, customized Spark builds aim to push Spark Streaming's latency from the second level down to under 100 milliseconds.
The advantages of Spark Streaming are:
1. It provides a rich set of APIs, so enterprises can quickly implement all kinds of complex business logic.
2. Data flowing into Spark Streaming can be combined with machine learning algorithms and graph computation (see the sketch after this list).
3. Spark Streaming builds on Spark's excellent lineage-based fault tolerance.
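As a minimal illustration of point 2, the sketch below feeds a DStream of feature vectors into MLlib's StreamingKMeans. The socket source, vector dimension, and parameter values are assumptions chosen only for this example.

import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.streaming.{ Seconds, StreamingContext }

object StreamingKMeansSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("streaming-kmeans-sketch").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))

    // Assumed source: one comma-separated feature vector per line, e.g. "0.1,0.2,0.3"
    val vectors = ssc.socketTextStream("localhost", 9999)
      .map(line => Vectors.dense(line.split(",").map(_.toDouble)))

    // Update the cluster centers incrementally with every incoming micro-batch
    val model = new StreamingKMeans()
      .setK(3)                           // number of clusters (assumption)
      .setDecayFactor(1.0)               // weight all batches equally
      .setRandomCenters(3, weight = 0.0) // 3-dimensional vectors (assumption)

    model.trainOn(vectors)
    model.predictOn(vectors).print()

    ssc.start()
    ssc.awaitTermination()
  }
}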
Can Spark Streaming process data one by one like Storm?
Storm processes data record by record, while Spark Streaming processes data in micro-batches based on a unit of time (the batch interval). Can Spark Streaming behave like Storm? The answer is: effectively yes, once the batch interval is made small enough, as the sketch below illustrates.
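The knob that controls Spark Streaming's latency is the batch interval passed to the StreamingContext. A minimal sketch, assuming a local master and a socket source, of shrinking the interval from the usual one second toward the sub-second range:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }

object BatchIntervalSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("batch-interval-sketch").setMaster("local[2]")

    // The usual choice: one-second micro-batches, so end-to-end latency is measured in seconds
    // val ssc = new StreamingContext(conf, Seconds(1))

    // A smaller batch interval pushes latency toward the sub-second range,
    // at the cost of more scheduling overhead per batch
    val ssc = new StreamingContext(conf, Milliseconds(200))

    ssc.socketTextStream("localhost", 9999).count().print()

    ssc.start()
    ssc.awaitTermination()
  }
}

Each record is still processed as part of a micro-batch, but with a small enough interval the behavior approaches Storm's record-at-a-time latency.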
Here is a demo that reads data from Kafka, traverses each RDD in the DStream with foreachRDD, and analyzes it with Spark SQL by converting the RDD to a DataFrame and registering it as a temporary table.
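The demo assumes the Kafka 0.10 direct-stream integration and Spark SQL are on the classpath. A build.sbt sketch, with version numbers being assumptions (the original environment appears to be Spark 2.0.0, so match whatever your cluster runs):

// build.sbt (versions are assumptions)
scalaVersion := "2.11.8"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.0.0",
  "org.apache.spark" %% "spark-sql" % "2.0.0",
  "org.apache.spark" %% "spark-streaming" % "2.0.0",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.0.0",
  // only needed if the PostgreSQL url/properties in the demo are actually used for JDBC writes
  "org.postgresql" % "postgresql" % "42.2.5"
)

The demo itself: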
package com.sprakStream.demo

import java.util.Properties
import java.util.regex.Matcher

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.sql.{ SQLContext, SparkSession }
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

import com.sprakStream.bean.IpMapper
import com.sprakStream.util.{ AppConstant, CommUtil }

object KafkaExample3 {

  def main(args: Array[String]): Unit = {

    //val conf = new SparkConf()
    //val sc = new SparkContext()
    //    System.setProperty("spark.sql.warehouse.dir", "D:\\tools\\spark-2.0.0-bin-hadoop2.6");
    //    System.setProperty("hadoop.home.dir", "D:\\tools\\hadoop-2.6.0");
    println("success to Init...")
    val url = "jdbc:postgresql://172.16.12.190:5432/dataex_tmp"
    val prop = new Properties()
    prop.put("user", "postgres")
    prop.put("password", "issing")

    val conf = new SparkConf().setAppName("wordcount").setMaster("local")
    val ssc = new StreamingContext(conf, Seconds(1))
    val sparkSession = SparkSession.builder().config(conf).getOrCreate()
    // Utilities is assumed to live in the same package; it provides log setup and the Apache log regex
    val util = Utilities
    util.setupLogging()
    // Construct a regular expression (regex) to extract fields from raw Apache log lines  
    val pattern = util.apacheLogPattern()
    // hostname:port for Kafka brokers, not Zookeeper  
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> AppConstant.KAFKA_HOST,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "example",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean))
    // List of topics you want to listen for from Kafka  
    val topics = List(AppConstant.KAFKA_TOPIC).toSet
    val lines = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)).map(_.value());

    // Keep the whole matched log line; fall back to the raw line when the regex does not match
    val splitWorks = lines.map { x =>
      val matcher: Matcher = pattern.matcher(x)
      if (matcher.matches()) matcher.group(0) else x
    }
    // Sliding window: 30-second window, recomputed every 2 seconds
    val splitDesc = splitWorks.window(Seconds(30), Seconds(2))

    // Call foreachRDD to traverse the RDDs in the windowed DStream
    splitDesc.foreachRDD({
      rdd =>
        // Get the singleton instance of SQLContext
        println()
        println("=================================================Start your performance 111111111=================================================")
        println()
        val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
        import sqlContext.implicits._
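        // Split each line on spaces and map a generated uuid plus the first nine fields into an IpMapper row;
        // this assumes every line yields at least nine space-separated fields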
        val wordsDataFrame = rdd.map(x => x.toString().split(" ")).map(x =>
          IpMapper(CommUtil.uuid(), x(0).toString(), x(1).toString(),
            x(2).toString(), x(3).toString(), x(4).toString(), x(5).toString(),
            x(6).toString(), x(7).toString(), x(8).toString())).toDF()
        wordsDataFrame.createOrReplaceTempView("wordsDataFrame")
        val wordCountsDataFrame =
          sqlContext.sql("select * from wordsDataFrame")
        wordCountsDataFrame.show()
    })

    // Call foreachRDD to traverse the RDDs in the un-windowed DStream
    splitWorks.foreachRDD({
      rdd =>
        // Get the singleton instance of SQLContext
        println()
        println("=================================================Start your performance at 2222222222222=================================================")
        println()
        val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
        import sqlContext.implicits._
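        // Same conversion as above: a generated uuid plus nine space-separated fields per IpMapper row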
        val wordsDataFrame = rdd.map(x => x.toString().split(" ")).map(x =>
          IpMapper(CommUtil.uuid(), x(0).toString(), x(1).toString(),
            x(2).toString(), x(3).toString(), x(4).toString(), x(5).toString(),
            x(6).toString(), x(7).toString(), x(8).toString())).toDF()
        wordsDataFrame.createOrReplaceTempView("wordsDataFrame")
        val wordCountsDataFrame =
          sqlContext.sql("select * from wordsDataFrame")
        wordCountsDataFrame.show()
    })

    // Kick it off  
    ssc.checkpoint("/user/root/spark/checkpoint")
    ssc.start()
    ssc.awaitTermination()
    println("KafkaExample-End.................................")
  }

}
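// Lazily instantiated singleton SQLContext shared across batches (the pattern recommended in the Spark Streaming programming guide)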
object SQLContextSingleton {

  @transient private var instance: SQLContext = _

  def getInstance(sparkContext: SparkContext): SQLContext = {
    if (instance == null) {
      instance = new SQLContext(sparkContext)
    }
    instance
  }
}
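The demo references several helper classes that are not shown: IpMapper, CommUtil, AppConstant, and Utilities. Below is a minimal sketch of what they might look like; every field name, constant value, and the log regex are assumptions made only so the example is self-contained.

// --- com/sprakStream/bean/IpMapper.scala (assumed) ---
package com.sprakStream.bean

// A generated uuid plus nine string fields for the parsed log columns; names are placeholders
case class IpMapper(uuid: String, field0: String, field1: String, field2: String, field3: String,
                    field4: String, field5: String, field6: String, field7: String, field8: String)

// --- com/sprakStream/util/CommUtil.scala and AppConstant.scala (assumed) ---
package com.sprakStream.util

import java.util.UUID

object CommUtil {
  // Assumed: a random UUID string used as the row id
  def uuid(): String = UUID.randomUUID().toString
}

object AppConstant {
  // Assumed values; point these at your own Kafka brokers and topic
  val KAFKA_HOST = "localhost:9092"
  val KAFKA_TOPIC = "apache-logs"
}

// --- com/sprakStream/demo/Utilities.scala (assumed) ---
package com.sprakStream.demo

import java.util.regex.Pattern
import org.apache.log4j.{ Level, Logger }

object Utilities {
  // Quiet Spark's default logging so the demo output is readable
  def setupLogging(): Unit = Logger.getRootLogger.setLevel(Level.ERROR)

  // Regex for the Apache combined log format (one capture group per field)
  def apacheLogPattern(): Pattern = {
    val ip = "(\\S+)"
    val client = "(\\S+)"
    val user = "(\\S+)"
    val dateTime = "(\\[.+?\\])"
    val request = "\"(.*?)\""
    val status = "(\\d{3})"
    val bytes = "(\\S+)"
    val referer = "\"(.*?)\""
    val agent = "\"(.*?)\""
    Pattern.compile(s"$ip $client $user $dateTime $request $status $bytes $referer $agent")
  }
}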



