I've got a bunch of Spark questions again, this time about Spark Streaming!
Without further ado, let's get straight to it!
The assignment is as follows.
The data set consists of Weibo comments posted under RNG's apology after the team's loss in the S8 quarter-finals.
Data description:
The data is in the file rng_comment.txt.

| Field | Meaning |
|---|---|
| index | Data id |
| child_comment | Number of replies |
| comment_time | Comment time |
| content | Comment content |
| da_v | Weibo personal verification status |
| like_status | Number of likes |
| pic | URL of the picture attached to the comment |
| user_id | Weibo user id |
| user_name | Weibo user name |
| vip_rank | Weibo membership level |
| stamp | Timestamp |
1.1. Create a topic named rng_comment in Kafka with two partitions and a replication factor of two (a programmatic sketch follows this list).
1.2. Preprocess the data to filter out empty rows.
1.3. Write the given file to Kafka, partitioned by data id: records with an odd id go to one partition, records with an even id to the other.
1.4. Connect Spark Streaming to Kafka and compute the following:
- Create a database named rng_comment in MySQL.
- Create a vip_rank table in the rng_comment database containing all fields of the data.
- Create a like_status table in the rng_comment database containing all fields of the data.
- Create a count_conmment table in the rng_comment database with two fields: time and number of comments (DDL sketches for all three tables also follow this list).
1.4.1. Query the comments from users with Weibo membership level 5 and write them to the vip_rank table in the MySQL database.
1.4.2. Query the comments with more than 10 likes and write them to the like_status table in the MySQL database.
1.4.3. Count the number of comments per day for the four days 2018/10/20, 2018/10/21, 2018/10/22, and 2018/10/23, and write the counts to the count_conmment table in the MySQL database.
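Task 1.1 can be done with the kafka-topics command line tool on one of the brokers, or programmatically. Below is a minimal Scala sketch using Kafka's AdminClient; the broker list is the same one used in the producer and the streaming job later in this post, and the object name is my own:

```scala
import java.util.{Collections, Properties}
import org.apache.kafka.clients.admin.{AdminClient, AdminClientConfig, NewTopic}

// Sketch only: creates the rng_comment topic with 2 partitions and replication factor 2 (task 1.1).
object HomeWork20200414_0_CreateTopic {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG,
      "192.168.100.111:9092,192.168.100.112:9092,192.168.100.113:9092")
    val admin = AdminClient.create(props)
    val topic = new NewTopic("rng_comment", 2, 2.toShort)
    // Blocks until the brokers have created the topic (throws if it already exists)
    admin.createTopics(Collections.singletonList(topic)).all().get()
    admin.close()
  }
}
```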
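Task 1.4 also needs the database and the three target tables. The DDL is not shown in the original post, so here is a rough sketch that creates them over JDBC; the column types and lengths are my own assumptions, chosen to match the setters used in the streaming job below, and the connection settings are the same localhost/root/root ones used there:

```scala
import java.sql.DriverManager

// Sketch only: creates the rng_comment database and the three tables from task 1.4.
// Column types/lengths are assumptions; adjust them to the real data if needed.
object HomeWork20200414_0_CreateTables {
  def main(args: Array[String]): Unit = {
    Class.forName("com.mysql.jdbc.Driver")
    val connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/", "root", "root")
    val stmt = connection.createStatement()
    stmt.execute("CREATE DATABASE IF NOT EXISTS rng_comment")
    // vip_rank and like_status hold all 11 fields of the raw data
    val commentColumns =
      """(`index` INT, child_comment INT, comment_time VARCHAR(40),
        |content TEXT, da_v VARCHAR(40), like_status VARCHAR(40),
        |pic VARCHAR(200), user_id VARCHAR(40), user_name VARCHAR(100),
        |vip_rank INT, stamp VARCHAR(40))""".stripMargin
    stmt.execute(s"CREATE TABLE IF NOT EXISTS rng_comment.vip_rank $commentColumns")
    stmt.execute(s"CREATE TABLE IF NOT EXISTS rng_comment.like_status $commentColumns")
    // count_conmment holds the date and the number of comments on that date
    stmt.execute("CREATE TABLE IF NOT EXISTS rng_comment.count_conmment (`time` VARCHAR(20), `count` INT)")
    stmt.close()
    connection.close()
  }
}
```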
The data is linked below.
The data file is too large to post here, so it is on Baidu Cloud; if the link has expired, please message the blogger privately. Link: https://pan.baidu.com/s/1jMs JbN9RLh5ItXFDXdwVjw Extraction code: 1234
The solution code is as follows.
Data preprocessing: remove the empty rows (only rows that split into all 11 fields are kept).
```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object HomeWork20200414_1_Pretreatment {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("HomeWork20200414_1_Pretreatment")
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("WARN")
    val fileRDD: RDD[String] = sc.textFile("input20200414/rng_comment.txt")
    // Keep only the rows that have all 11 tab-separated fields; this drops empty and malformed lines
    fileRDD.filter { x =>
      val datas = x.split("\t")
      datas.length == 11
    }.coalesce(1).saveAsTextFile("output20200414")
    sc.stop()
  }
}
```
Read the data and write it to Kafka.
```java
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Properties;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class HomeWork20200414_2_Producer {
    public static void main(String[] args) throws IOException {
        Properties props = new Properties();
        props.put("bootstrap.servers", "192.168.100.111:9092,192.168.100.112:9092,192.168.100.113:9092");
        props.put("acks", "-1");
        props.put("retries", 0);
        props.put("batch.size", 16384);
        props.put("linger.ms", 1);
        props.put("buffer.memory", 33554432);
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        Producer<String, String> producer = new KafkaProducer<>(props);

        File inputFile = new File("output20200414/part-00000");
        BufferedReader bufferedReader = new BufferedReader(new FileReader(inputFile));
        String line = null;
        int partition = 0;
        while ((line = bufferedReader.readLine()) != null) {
            try {
                // Even data ids go to partition 0, odd ids to partition 1
                if (Integer.parseInt(line.split("\t")[0]) % 2 == 0) {
                    partition = 0;
                } else {
                    partition = 1;
                }
            } catch (NumberFormatException e) {
                // Skip rows whose id is not a number
                continue;
            }
            producer.send(new ProducerRecord<String, String>("rng_comment", partition, String.valueOf(partition), line));
        }
        bufferedReader.close();
        producer.close();
    }
}
```
Spark Streaming consumes the Kafka data and writes it to MySQL. (Here I write to MySQL straight from the RDDs; the data could also be converted to a DataFrame and written with Spark SQL, which I will add later! A rough sketch of that approach is at the end of this post.)
```scala
import java.sql.{Connection, DriverManager, PreparedStatement}
import java.text.SimpleDateFormat

import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object HomeWork20200414_3_SparkStreaming {

  val driver = "com.mysql.jdbc.Driver"
  val url = "jdbc:mysql://localhost:3306/rng_comment"
  val username = "root"
  val password = "root"

  def main(args: Array[String]): Unit = {
    /**
     * 1.4.1 Query the comments from users with Weibo membership level 5 and write them to the vip_rank table in MySQL
     * 1.4.2 Query the comments with more than 10 likes and write them to the like_status table in MySQL
     * 1.4.3 Count the comments per day for 2018/10/20 - 2018/10/23 and write the counts to the count_conmment table in MySQL
     */
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("HomeWork20200414_3_SparkStreaming")
    val ssc = new StreamingContext(sparkConf, Seconds(3))
    ssc.sparkContext.setLogLevel("WARN")

    // Kafka consumer parameters
    val kafkaParams: Map[String, Object] = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.100.111:9092,192.168.100.112:9092,192.168.100.113:9092",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.GROUP_ID_CONFIG -> "SparkKafka77777",
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest",
      // false turns off auto-commit: offsets are then committed by Spark to the checkpoint, or maintained manually by the programmer
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean)
    )

    // Topic to subscribe to
    val topics = Array("rng_comment")
    val recordDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent, // location strategy recommended by the source code: spreads partitions evenly across the Spark executors
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams) // consumer strategy recommended by the source code
    )

    // Split every record into its 11 tab-separated fields and cache, since the stream is reused three times
    val resultDStream: DStream[Array[String]] = recordDStream.map(_.value()).map(_.split("\t")).cache()

    // 1.4.1 Users with Weibo membership level 5 -> vip_rank table
    resultDStream.filter(_(9) == "5").foreachRDD { rdd: RDD[Array[String]] =>
      rdd.foreachPartition { iter: Iterator[Array[String]] =>
        Class.forName(driver)
        val connection: Connection = DriverManager.getConnection(url, username, password)
        val sql = "insert into vip_rank values (?,?,?,?,?,?,?,?,?,?,?)"
        iter.foreach { line: Array[String] =>
          val statement: PreparedStatement = connection.prepareStatement(sql)
          statement.setInt(1, line(0).toInt)
          statement.setInt(2, line(1).toInt)
          statement.setString(3, line(2))
          statement.setString(4, line(3))
          statement.setString(5, line(4))
          statement.setString(6, line(5))
          statement.setString(7, line(6))
          statement.setString(8, line(7))
          statement.setString(9, line(8))
          statement.setInt(10, line(9).toInt)
          statement.setString(11, line(10))
          statement.executeUpdate()
          statement.close()
        }
        connection.close()
      }
    }

    // 1.4.2 Comments with more than 10 likes -> like_status table
    resultDStream.filter(_(5).toInt > 10).foreachRDD { rdd: RDD[Array[String]] =>
      rdd.foreachPartition { iter: Iterator[Array[String]] =>
        Class.forName(driver)
        val connection: Connection = DriverManager.getConnection(url, username, password)
        val sql = "insert into like_status values (?,?,?,?,?,?,?,?,?,?,?)"
        iter.foreach { line: Array[String] =>
          val statement: PreparedStatement = connection.prepareStatement(sql)
          statement.setInt(1, line(0).toInt)
          statement.setInt(2, line(1).toInt)
          statement.setString(3, line(2))
          statement.setString(4, line(3))
          statement.setString(5, line(4))
          statement.setString(6, line(5))
          statement.setString(7, line(6))
          statement.setString(8, line(7))
          statement.setString(9, line(8))
          statement.setInt(10, line(9).toInt)
          statement.setString(11, line(10))
          statement.executeUpdate()
          statement.close()
        }
        connection.close()
      }
    }

    val dateFormat1 = new SimpleDateFormat("yyyy/MM/dd HH:mm")
    val dateFormat2 = new SimpleDateFormat("yyyy/MM/dd")

    // 1.4.3 Comment counts per day for 2018/10/20 - 2018/10/23 -> count_conmment table
    val value: DStream[Array[String]] = resultDStream.filter { date: Array[String] =>
      val str: String = dateFormat2.format(dateFormat1.parse(date(2)))
      "2018/10/20".equals(str) || "2018/10/21".equals(str) || "2018/10/22".equals(str) || "2018/10/23".equals(str)
    }
    value.foreachRDD { rdd: RDD[Array[String]] =>
      rdd.groupBy(x => dateFormat2.format(dateFormat1.parse(x(2))))
        .map(x => x._1 -> x._2.size)
        .foreachPartition { iter: Iterator[(String, Int)] =>
          Class.forName(driver)
          val connection: Connection = DriverManager.getConnection(url, username, password)
          val sql = "insert into count_conmment values (?,?)"
          iter.foreach { line: (String, Int) =>
            val statement: PreparedStatement = connection.prepareStatement(sql)
            statement.setString(1, line._1)
            statement.setInt(2, line._2)
            statement.executeUpdate()
            statement.close()
          }
          connection.close()
        }
    }

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}
```
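As mentioned above, the hand-written JDBC loops could be replaced by converting each micro-batch to a DataFrame and letting Spark SQL write it to MySQL. Here is a minimal sketch of that alternative for the vip_rank case; it assumes the same field order and connection settings as the job above, and the object and method names are my own:

```scala
import java.util.Properties
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SaveMode, SparkSession}

// Sketch only: write one micro-batch of vip_rank records with the DataFrame JDBC writer.
object SparkSqlMysqlWriter {
  def writeBatch(rdd: RDD[Array[String]]): Unit = {
    val spark = SparkSession.builder().config(rdd.sparkContext.getConf).getOrCreate()
    import spark.implicits._

    // Map the 11 tab-separated fields onto named columns
    val df = rdd.map(f => (f(0).toInt, f(1).toInt, f(2), f(3), f(4), f(5),
        f(6), f(7), f(8), f(9).toInt, f(10)))
      .toDF("index", "child_comment", "comment_time", "content", "da_v",
        "like_status", "pic", "user_id", "user_name", "vip_rank", "stamp")

    val props = new Properties()
    props.put("user", "root")
    props.put("password", "root")
    props.put("driver", "com.mysql.jdbc.Driver")

    // Append the batch to the existing vip_rank table
    df.write.mode(SaveMode.Append)
      .jdbc("jdbc:mysql://localhost:3306/rng_comment", "vip_rank", props)
  }
}
```

Inside the streaming job this would replace the JDBC loop, for example `resultDStream.filter(_(9) == "5").foreachRDD(rdd => SparkSqlMysqlWriter.writeBatch(rdd))`.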