Java implementation of Spark Streaming and Kafka integration for stream computing
- Added June 26, 2017: I have since taken over the search system and gained a lot of new experience over the past half year. I'm too lazy to rework this rough article; this newer post may help in understanding the rough code below: http://blog.csdn.net/yujishi2/article/details/73849237.
- Background: there are many articles about Spark Streaming on the Internet, but most of them are written in Scala. Since our e-commerce real-time recommendation project is mainly in Java, we hit a few pitfalls and wrote a Java version. The code is fairly stream-of-consciousness, so go easy on it; discussion is welcome.
- Process: Spark Streaming reads users' real-time click data from Kafka, filters it, reads the goods similarity matrix from Redis and the user's historical behavior from the DB, computes the user's interest score in real time (a compact sketch of the scoring formula follows this list), writes one copy of the results to Redis for the API layer to read and display, and writes another copy to HDFS for offline calculation of precision and recall.
- Supplement: as I understand it, in large real-time recommendation systems collaborative filtering is generally only used to generate the candidate set; the interest computation is replaced by CTR-based reranking and other strategies, and an online rerank service would be called inside calculateInterest to do the sorting.
- Added 12/13: the recall stage remains unchanged. At present CTR prediction plus rule-based ordering is used, with LTR to follow.
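Before the code, here is a compact sketch of the interest score, as I read it out of calculateInterest below (it is not a separate implementation): for each candidate item similar to the clicked item, the predicted interest is a similarity-weighted average of the user's action scores over that candidate's own similar items, with click = 0.2, add_cart = 0.6, order = 0.8, and no recorded action = 0.1.

    // Sketch only: the same weighted average that calculateInterest builds inline below.
    // sims[k]   - similarity between the candidate item and its k-th similar item
    // scores[k] - the user's action score for that k-th item (0.2 click, 0.6 add_cart, 0.8 order, 0.1 default)
    static double interestScore(double[] sims, double[] scores) {
        double weighted = 0.0, total = 0.0;
        for (int k = 0; k < sims.length; k++) {
            weighted += sims[k] * scores[k];
            total += sims[k];
        }
        return weighted / total;
    }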
Enough talk, here is the code:
public class Main {

    static final String ZK_QUORUM = "*.*.*.*:2181,*.*.*.*:2181,*.*.*.*:2181/kafka";
    static final String GROUP = "test-consumer-group";
    static final String TOPICSS = "user_trace";
    static final String NUM_THREAD = "64";

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("main.java.computingCenter");
        // Create the context with a 2-second batch size: read from Kafka every two seconds
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));

        int numThreads = Integer.parseInt(NUM_THREAD);
        Map<String, Integer> topicMap = new HashMap<String, Integer>();
        String[] topics = TOPICSS.split(",");
        for (String topic : topics) {
            topicMap.put(topic, numThreads);
        }

        JavaPairReceiverInputDStream<String, String> messages =
                KafkaUtils.createStream(jssc, ZK_QUORUM, GROUP, topicMap);

        JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
            public String call(Tuple2<String, String> tuple2) {
                return tuple2._2();
            }
        });

        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            public Iterable<String> call(String lines) {
                // Kafka message format:
                // {"topic":"user_trace","partitionKey":"0","timestamp":1471524044018,
                //  "data":"0=16367058905171371918%3a19684617823830207","logId":"0",
                //  "contentType":"application/x-www-form-urlencoded"}
                List<String> arr = new ArrayList<String>();
                for (String s : lines.split(" ")) {
                    Map j = JSON.parseObject(s);
                    String s1 = "";
                    String s2 = "";
                    try {
                        s1 = URLDecoder.decode(j.get("Data").toString(), "UTF-8");
                        s2 = s1.split("=")[1];
                    } catch (UnsupportedEncodingException e) {
                        e.printStackTrace();
                    }
                    arr.add(s2);
                }
                return arr;
            }
        });

        JavaPairDStream<String, String> goodsSimilarityLists = words.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String s) throws Exception {
                // Filter out malformed records
                if (s.split(":").length == 2) {
                    return true;
                }
                return false;
            }
        }).mapPartitionsToPair(new PairFlatMapFunction<Iterator<String>, String, String>() {
            // Process records partition by partition
            @Override
            public Iterable<Tuple2<String, String>> call(Iterator<String> s) throws Exception {
                ArrayList<Tuple2<String, String>> result = new ArrayList<Tuple2<String, String>>();
                while (s.hasNext()) {
                    String x = s.next();
                    String userId = x.split(":")[0];
                    String goodsId = x.split(":")[1];
                    System.out.println(x);
                    LinkedHashMap<Long, Double> recommendMap = null;
                    try {
                        // This service reads data from Redis and the DB, computes the real-time interest,
                        // and writes the recommendation results to Redis for use by the API layer
                        CalculateInterestService calculateInterestService = new CalculateInterestService();
                        try {
                            recommendMap = calculateInterestService.calculateInterest(userId, goodsId);
                        } catch (Exception e) {
                            e.printStackTrace();
                        }
                        String text = "";
                        int count = 0;
                        for (Map.Entry<Long, Double> entry : recommendMap.entrySet()) {
                            text = text + entry.getKey();
                            if (count == recommendMap.size() - 1) {
                                break;
                            }
                            count = count + 1;
                            text = text + "{/c}";
                        }
                        text = System.currentTimeMillis() + ":" + text;
                        result.add(new Tuple2<String, String>(userId, text));
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
                return result;
            }
        });

        goodsSimilarityLists.foreachRDD(new Function<JavaPairRDD<String, String>, Void>() {
            @Override
            public Void call(JavaPairRDD<String, String> rdd) throws Exception {
                // Print the RDD, useful for debugging
                System.out.println(rdd.collect());
                return null;
            }
        });

        JavaPairDStream<Text, Text> goodsSimilarityListsText = goodsSimilarityLists.mapToPair(
                new PairFunction<Tuple2<String, String>, Text, Text>() {
            @Override
            public Tuple2<Text, Text> call(Tuple2<String, String> ori) throws Exception {
                // Convert the Tuple2 to org.apache.hadoop.io.Text so saveAsHadoopFiles can write to HDFS
                return new Tuple2(new Text(ori._1), new Text(ori._2));
            }
        });

        // Write to HDFS
        goodsSimilarityListsText.saveAsHadoopFiles("/user/hadoop/recommend_list/rl", "123",
                Text.class, Text.class, SequenceFileOutputFormat.class);

        jssc.start();
        jssc.awaitTermination();
    }
}
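A side note on the Kafka API: KafkaUtils.createStream above is the older receiver-based integration, which tracks offsets in ZooKeeper. If your cluster is on Spark 1.3+ with the spark-streaming-kafka 0.8 artifact, the direct (receiver-less) stream is usually preferred because it manages offsets itself and gives better delivery semantics. A minimal sketch, assuming a "brokers" string such as "*.*.*.*:9092" that is not part of the original code:

    // Sketch only: direct Kafka integration (Spark 1.3+, spark-streaming-kafka 0.8)
    Set<String> topicsSet = new HashSet<String>(Arrays.asList(TOPICSS.split(",")));
    Map<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list", brokers);   // brokers is a placeholder, e.g. "*.*.*.*:9092"
    JavaPairInputDStream<String, String> directMessages = KafkaUtils.createDirectStream(
            jssc,
            String.class,           // key class
            String.class,           // value class
            StringDecoder.class,    // key decoder (kafka.serializer.StringDecoder)
            StringDecoder.class,    // value decoder
            kafkaParams,
            topicsSet);
    // The rest of the pipeline (map / flatMap / filter / ...) stays the same.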
public class CalculateInterestService {

    private String dictKey = "greate_item_sim_2.0";
    private String recommendTable = "great_recommend_table_2.0";

    static final String HIGO_BASE_URL = "jdbc:mysql://*.*.*.*:3212/*";
    static final String HIGO_BASE_USER = "*";
    static final String HIGO_BASE_PASS = "*";

    public LinkedHashMap<Long, Double> calculateInterest(String userId, String traceGoodsId) {
        LinkedHashMap<Long, Double> sortedMap = new LinkedHashMap<Long, Double>();
        String[] simGoods = RedisHelper.getInstance().hget(dictKey, traceGoodsId).split(",");

        // The user's history should be stored as action:goodsId:timestamp; to be refactored,
        // BI should write it into a separate table
        HashMap<Long, String> userTrace = null;
        try {
            userTrace = getUserTrace(userId);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
            return sortedMap;
        }

        HashMap<Long, Double> recommendMap = new HashMap<Long, Double>();
        String[] simGoodsIds = new String[simGoods.length];
        for (int i = 0; i < simGoods.length; i++) {
            simGoodsIds[i] = simGoods[i].split(":")[0];
        }
        List<String> pSimGoodsIds = RedisHelper.getInstance().hmget(dictKey, simGoodsIds);
        HashMap<Long, String> predictSimGoodsIds = new HashMap<Long, String>();
        for (int i = 0; i < simGoodsIds.length; i++) {
            predictSimGoodsIds.put(Long.parseLong(simGoodsIds[i]), pSimGoodsIds.get(i));
        }

        for (String item : simGoods) {
            // needs optimisation
            Double totalSum = 0.0;
            Double sum = 0.0;
            Long originGoodsId = Long.parseLong(item.split(":")[0]);
            for (String predictGoods : predictSimGoodsIds.get(originGoodsId).split(",")) {
                Long goodsId = Long.parseLong(predictGoods.split(":")[0].toString());
                Double sim = Double.valueOf(predictGoods.split(":")[1].toString());
                totalSum = totalSum + sim;
                Double score = 0.0;
                if (!userTrace.containsKey(goodsId)) {
                    // TODO: the user rating matrix is too sparse; SVD is needed to fill in ratings.
                    // If there is no record for now, the default score is 0.1
                    userTrace.put(goodsId, "default");
                }
                String action = userTrace.get(goodsId);
                if (action.equals("click")) {
                    score = 0.2;
                } else if (action.equals("favorate")) {
                } else if (action.equals("add_cart")) {
                    score = 0.6;
                } else if (action.equals("order")) {
                    score = 0.8;
                } else if (action.equals("default")) {
                    score = 0.1;
                }
                // The similarity dictionary should be stored in goodsId:sim format; to be refactored
                sum = sum + score * sim;
            }
            Double predictResult = sum / totalSum;
            recommendMap.put(originGoodsId, predictResult);
        }

        // Sort the recommendation list by predicted interest, descending
        List<Map.Entry<Long, Double>> list = new ArrayList<Map.Entry<Long, Double>>(recommendMap.entrySet());
        Collections.sort(list, new Comparator<Map.Entry<Long, Double>>() {
            @Override
            public int compare(Map.Entry<Long, Double> o1, Map.Entry<Long, Double> o2) {
                return o2.getValue().compareTo(o1.getValue());
            }
        });
        Map.Entry<Long, Double> tmpEntry = null;
        Iterator<Map.Entry<Long, Double>> iter = list.iterator();
        while (iter.hasNext()) {
            tmpEntry = iter.next();
            sortedMap.put(tmpEntry.getKey(), tmpEntry.getValue());
        }
        writeRecommendListToRedis(userId, sortedMap);
        return sortedMap;
    }

    private HashMap<Long, String> getUserTrace(String userId) throws ClassNotFoundException {
        //SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
        Class.forName("com.mysql.jdbc.Driver");
        PreparedStatement stmt = null;
        Connection conn = null;
        UserTrace userTrace = new UserTrace();
        try {
            conn = DriverManager.getConnection(HIGO_BASE_URL, HIGO_BASE_USER, HIGO_BASE_PASS);
            String sql = "select * from t_pandora_goods_record where account_id=" + userId;
            stmt = (PreparedStatement) conn.prepareStatement(sql);
            ResultSet rs = stmt.executeQuery();
            while (rs.next()) {
                userTrace.setId(Long.parseLong(rs.getString(1)));
                userTrace.setAccountId(Long.parseLong(rs.getString(2)));
                userTrace.setGoodsIds(rs.getString(3));
                userTrace.setMtime(rs.getString(4));
            }
            stmt.close();
            conn.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        String[] goodsActionTimestamp = userTrace.getGoodsIds().split(",");
        HashMap<Long, String> hm = new HashMap<Long, String>();
        for (String ac : goodsActionTimestamp) {
            Long goodsId = Long.parseLong(ac.split(":")[0]);
            //String action = ac.split(":")[1];
            //String timestamp = ac.split(":")[2];
            // Hack: treat every historical record as a click for now; the next step is for BI to write
            // the user's history as action:goodsId:timestamp so the action type can join the weight calculation
            String action = "click";
            hm.put(goodsId, action);
        }
        return hm;
    }

    private void writeRecommendListToRedis(String userId, LinkedHashMap<Long, Double> sortedMap) {
        String recommendList = "";
        int count = 0;
        for (Map.Entry<Long, Double> entry : sortedMap.entrySet()) {
            recommendList = recommendList + entry.getKey();
            if (count == sortedMap.size() - 1) {
                break;
            }
            count = count + 1;
            recommendList = recommendList + ",";
        }
        RedisHelper.getInstance().hset(recommendTable, userId, recommendList);
    }
}
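The code above also relies on a RedisHelper singleton and a UserTrace POJO that the post does not show. A minimal sketch of what they might look like, assuming Jedis as the Redis client; the class internals, the Redis address, and the field layout are my assumptions, not from the original:

// Hypothetical helper; the original post does not include it.
public class RedisHelper {
    private static final RedisHelper INSTANCE = new RedisHelper();
    // Assumed Redis address; replace with your own. In a real Spark job a JedisPool
    // per executor would be safer than a single shared connection.
    private final Jedis jedis = new Jedis("127.0.0.1", 6379);

    private RedisHelper() {}

    public static RedisHelper getInstance() {
        return INSTANCE;
    }

    public String hget(String key, String field) {
        return jedis.hget(key, field);
    }

    public List<String> hmget(String key, String... fields) {
        return jedis.hmget(key, fields);
    }

    public void hset(String key, String field, String value) {
        jedis.hset(key, field, value);
    }
}

// Hypothetical POJO matching the getters/setters used in getUserTrace.
public class UserTrace {
    private Long id;
    private Long accountId;
    private String goodsIds;   // e.g. "goodsId:...,goodsId:..."
    private String mtime;

    public Long getId() { return id; }
    public void setId(Long id) { this.id = id; }
    public Long getAccountId() { return accountId; }
    public void setAccountId(Long accountId) { this.accountId = accountId; }
    public String getGoodsIds() { return goodsIds; }
    public void setGoodsIds(String goodsIds) { this.goodsIds = goodsIds; }
    public String getMtime() { return mtime; }
    public void setMtime(String mtime) { this.mtime = mtime; }
}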