Spark Streaming: manually saving Kafka offsets to ZooKeeper (Java implementation)

Keywords: Spark, Kafka, Apache, Scala

Preface

There are plenty of examples online of managing Kafka offsets manually, but most of them target the 0.8 connector and are written in Scala; the kafka-0-10 integration is rarely covered, and what does exist is incomplete. Version 0.10 is compatible with earlier versions, and its new API is noticeably more concise and easier to use. Since I could not find anything usable as-is, I took the time to go through the official API docs and write a reasonably complete Java example. I hope it is helpful.

POM dependency versions

    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>2.2.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.11</artifactId>
      <!-- keep all Spark modules on the same version -->
      <version>2.2.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
      <version>2.2.0</version>
    </dependency>
    <!-- org.I0Itec.zkclient.ZkClient is resolved transitively via the kafka_2.11
         dependency of the connector; add com.101tec:zkclient explicitly if your
         build cannot resolve it -->

Demo

Not much needs to be said about the code. Comments are sparse, but good naming is its own documentation.

import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.I0Itec.zkclient.ZkClient;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import scala.Tuple2;

public class WordCount {
    private static String groupId = "group1";
    // Offsets restored from ZooKeeper, handed to the consumer at startup
    private static Map<TopicPartition, Long> offsets = new HashMap<>();
    // Topics to subscribe to
    private static Collection<String> topics = Arrays.asList("mytest-topic", "dc01");

    private static String zkServerUrls = "dc-sit-225:2181";

    private static ZkClient zkClient = new ZkClient(zkServerUrls);
    // Consumer group root path, e.g. /consumers/offsets/group1/<topic>/<partition>
    private static String rootPath = "/consumers/offsets/" + groupId;

    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("KafkaWordCount");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
        initOffset(zkClient);
        JavaInputDStream<ConsumerRecord<String, String>> stream =
                KafkaUtils.createDirectStream(
                        jssc,
                        LocationStrategies.PreferConsistent(),
                        // Topics, Kafka parameters and the restored offsets
                        ConsumerStrategies.<String, String>Subscribe(topics, initKafkaParams(), offsets)
                );
        JavaDStream<String> words = stream.flatMap(x -> updateOffset(x));
        // Count each word in each batch
        JavaPairDStream<String, Integer> pairs = words.mapToPair(s -> new Tuple2<>(s, 1));
        JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey((i1, i2) -> i1 + i2);
        wordCounts.print();
        jssc.start();              // Start the computation
        jssc.awaitTermination();   // Wait for the computation to terminate
    }

    // Restore saved offsets from ZooKeeper, or create the node tree on first run
    public static void initOffset(ZkClient zkClient) {
        if (!zkClient.exists(rootPath)) {
            for (String topic : topics) {
                String path = rootPath + "/" + topic;
                zkClient.createPersistent(path, true);
            }
        } else {
            List<String> topicSet = zkClient.getChildren(rootPath);
            for (String topic : topicSet) {
                String topicPath = rootPath + "/" + topic;
                List<String> partitionSet = zkClient.getChildren(topicPath);
                for (String partition : partitionSet) {
                    Long offset = zkClient.readData(topicPath + "/" + partition);
                    TopicPartition topicPartition = new TopicPartition(topic, Integer.parseInt(partition));
                    offsets.put(topicPartition, offset);
                }
            }
        }
    }

    public static Map<String, Object> initKafkaParams() {
        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "dc-sit-225:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", groupId);
        // Only consulted when no offset is found for a partition
        kafkaParams.put("auto.offset.reset", "earliest");
        // Offsets are committed manually to ZooKeeper, so disable auto commit
        kafkaParams.put("enable.auto.commit", false);
        return kafkaParams;
    }

    // Persist the offset of each record to ZooKeeper, then split its value into words
    public static Iterator<String> updateOffset(ConsumerRecord<String, String> consumerRecord) {
        // Store offset + 1 (the next offset to read) so a restart does not reprocess this record
        long nextOffset = consumerRecord.offset() + 1;
        TopicPartition topicPartition = new TopicPartition(consumerRecord.topic(), consumerRecord.partition());
        offsets.put(topicPartition, nextOffset);
        String path = rootPath + "/" + consumerRecord.topic() + "/" + consumerRecord.partition();
        if (!zkClient.exists(path)) {
            zkClient.createPersistent(path, true);
        }
        // Write the offset whether the node was just created or already existed
        zkClient.writeData(path, nextOffset);
        return Arrays.asList(consumerRecord.value().split(" ")).iterator();
    }
}
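
The demo above writes to ZooKeeper once per record, from inside flatMap, which runs on the executors; that works in local mode but is chatty and ties offset tracking to a static ZkClient in each executor JVM. The pattern shown in the Spark kafka-0-10 integration guide is to grab each batch's offset ranges on the driver and commit them once per batch. Below is a minimal sketch of that variant, assuming the same rootPath, zkClient and stream as in the demo (it needs two extra imports, org.apache.spark.streaming.kafka010.HasOffsetRanges and org.apache.spark.streaming.kafka010.OffsetRange):

stream.foreachRDD(rdd -> {
    // The cast to HasOffsetRanges only works on the first DStream, before any
    // shuffle or repartition, because only the underlying KafkaRDD carries ranges
    OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();

    // ... process the batch here (word count, output writes, etc.) ...

    // Commit once per batch, on the driver, after the batch work succeeds
    for (OffsetRange range : offsetRanges) {
        String path = rootPath + "/" + range.topic() + "/" + range.partition();
        if (!zkClient.exists(path)) {
            zkClient.createPersistent(path, true);
        }
        // untilOffset() is already "last processed + 1", i.e. the next offset to read
        zkClient.writeData(path, range.untilOffset());
    }
});

Since the commit happens after the batch is processed, a failure between processing and commit replays that batch on restart, i.e. at-least-once semantics, the same guarantee the per-record version gives.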
