Spark Streaming is a near-real-time (quasi-real-time) stream processing framework: it handles data in micro-batches, so the latency for processing real-time data is typically at the second level, and end-to-end response times can stretch toward minutes. Storm is a true real-time stream processing framework whose response times are measured in milliseconds. The choice of streaming framework therefore depends on the specific business scenario. It is worth clarifying that many people claim Spark Streaming runs unstably, loses data, or has poor transactional support; in most cases this is because they have not mastered Spark Streaming and Spark itself. As for latency, customized Spark builds aim to push Spark Streaming's latency from the second level down to under 100 milliseconds.
The advantages of Spark Streaming are:
1. It provides a rich set of APIs, so enterprises can quickly implement all kinds of complex business logic.
2. Data flowing into Spark Streaming can be combined with machine learning algorithms and graph computation.
3. Spark Streaming builds on Spark's excellent lineage (RDD) mechanism.
Can Spark Streaming process data record by record the way Storm does? Storm processes one record at a time, while Spark Streaming processes data per batch interval. Can Spark Streaming behave like Storm? The answer is: yes.
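To make the micro-batch point concrete, here is a minimal, self-contained sketch (not part of the demo project below; the object name, the socket source on localhost:9999, and the 500 ms interval are all illustrative assumptions). It shows that the batch interval passed to StreamingContext is what bounds Spark Streaming's latency, and that shrinking it moves the framework closer to Storm-style per-record behavior.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

object BatchIntervalSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("batch-interval-sketch").setMaster("local[2]")

    // Spark Streaming groups incoming records into micro-batches; the interval
    // below is the smallest unit of work, so it effectively bounds the latency.
    val ssc = new StreamingContext(conf, Milliseconds(500))

    // A socket source keeps the sketch self-contained; any receiver would do.
    val lines = ssc.socketTextStream("localhost", 9999)
    lines.count().print() // prints one count per 500 ms micro-batch

    ssc.start()
    ssc.awaitTermination()
  }
}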
Here's a demo that reads data from Kafka, traverses each RDD in the DStream with foreachRDD, and analyzes the data with Spark SQL by converting each RDD to a temporary table.
package com.sprakStream.demo

import java.util.Properties
import java.util.regex.Matcher

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

import com.sprakStream.bean.IpMapper
import com.sprakStream.util.{AppConstant, CommUtil}

object KafkaExcamle3 {

  def main(args: Array[String]): Unit = {
    // System.setProperty("spark.sql.warehouse.dir", "D:\\tools\\spark-2.0.0-bin-hadoop2.6");
    // System.setProperty("hadoop.home.dir", "D:\\tools\\hadoop-2.6.0");
    println("success to Init...")

    // PostgreSQL connection settings (declared here but not used further in this demo)
    val url = "jdbc:postgresql://172.16.12.190:5432/dataex_tmp"
    val prop = new Properties()
    prop.put("user", "postgres")
    prop.put("password", "issing")

    val conf = new SparkConf().setAppName("wordcount").setMaster("local")
    val ssc = new StreamingContext(conf, Seconds(1))
    val sparkSession = SparkSession.builder().config(conf).getOrCreate()

    // Utilities is a project helper (not shown) that sets up logging and
    // builds the Apache access-log regex.
    val util = Utilities
    util.setupLogging()
    // Construct a regular expression (regex) to extract fields from raw Apache log lines
    val pattern = util.apacheLogPattern()

    // hostname:port for the Kafka brokers, not ZooKeeper
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> AppConstant.KAFKA_HOST,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "example",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean))

    // List of topics you want to listen for from Kafka
    val topics = List(AppConstant.KAFKA_TOPIC).toSet

    val lines = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)).map(_.value())

    val spiltWorks = lines.map(x => {
      val matcher: Matcher = pattern.matcher(x)
      // Return the whole matched line; non-matching lines become empty strings
      if (matcher.matches()) matcher.group(0) else ""
    })

    val spiltDesc = spiltWorks.map { x => x.toString() }.window(Seconds(30), Seconds(2))

    // Call the foreachRDD method to traverse the RDDs in the windowed DStream
    spiltDesc.foreachRDD({ rdd =>
      // Get the singleton instance of SQLContext
      println()
      println("=================================================Start your performance 111111111=================================================")
      println()
      val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
      import sqlContext.implicits._
      val wordsDataFrame = rdd.map(x => x.toString().split(" "))
        .map(x => IpMapper(CommUtil.uuid(),
          x(0).toString(), x(1).toString(), x(2).toString(),
          x(3).toString(), x(4).toString(), x(5).toString(),
          x(6).toString(), x(7).toString(), x(8).toString())).toDF()
      wordsDataFrame.registerTempTable("wordsDataFrame")
      val wordCountsDataFrame = sqlContext.sql("select * from wordsDataFrame")
      wordCountsDataFrame.show()
    })

    // Call the foreachRDD method to traverse the RDDs in the original DStream
    spiltWorks.foreachRDD({ rdd =>
      // Get the singleton instance of SQLContext
      println()
      println("=================================================Start your performance at 2222222222222=================================================")
      println()
      val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
      import sqlContext.implicits._
      val wordsDataFrame = rdd.map(x => x.toString().split(" "))
        .map(x => IpMapper(CommUtil.uuid(),
          x(0).toString(), x(1).toString(), x(2).toString(),
          x(3).toString(), x(4).toString(), x(5).toString(),
          x(6).toString(), x(7).toString(), x(8).toString())).toDF()
      wordsDataFrame.registerTempTable("wordsDataFrame")
      val wordCountsDataFrame = sqlContext.sql("select * from wordsDataFrame")
      wordCountsDataFrame.show()
    })

    // Kick it off
    ssc.checkpoint("/user/root/spark/checkpoint")
    ssc.start()
    ssc.awaitTermination()
    println("KafkaExample-End.................................")
  }
}

object SQLContextSingleton {

  @transient private var instance: SQLContext = _

  def getInstance(sparkContext: SparkContext): SQLContext = {
    if (instance == null) {
      instance = new SQLContext(sparkContext)
    }
    instance
  }
}
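One note on the SQL API used in the demo: SQLContext and registerTempTable are legacy in Spark 2.x, and the job already builds a SparkSession, so the same foreachRDD body can be written against that session with createOrReplaceTempView. The sketch below is only an illustration of that variant; LogLine is a simplified placeholder standing in for the project's IpMapper, and its three fields are assumptions, not the real schema.

import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.DStream

// Placeholder for the project's IpMapper bean; the fields are illustrative only.
case class LogLine(host: String, request: String, status: String)

object ForeachRddWithSparkSession {
  // Convert each micro-batch to a DataFrame through the existing SparkSession
  // instead of the legacy SQLContext singleton.
  def process(lines: DStream[String], spark: SparkSession): Unit = {
    lines.foreachRDD { rdd =>
      import spark.implicits._
      val df = rdd.map(_.split(" "))
        .filter(_.length >= 3)                     // drop lines too short to map
        .map(f => LogLine(f(0), f(1), f(2)))
        .toDF()
      df.createOrReplaceTempView("logLines")       // replaces registerTempTable
      spark.sql("select * from logLines").show()   // same query shape as the demo
    }
  }
}

Working through the SparkSession directly also removes the need to keep a separate SQLContextSingleton object alive across batches.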