Statistics of Top 3 Hot Search Terms in Spark SQL

Keywords: Spark Apache SQL Java

Implementation ideas

1. Read the raw data (an HDFS log file) into an input RDD (a hypothetical sample of the input format is shown after this list).

2. Use the filter operator on the input RDD to keep only the records that satisfy the query conditions.

2.1 Common practice: reference the external query-condition Map directly inside the filter function. Done this way, though, a copy of the Map is shipped to every task (poor performance).

2.2 Optimized: encapsulate the query conditions in a Broadcast variable and use that inside the filter operator, so each executor holds a single copy.

3. Map the data into the format "(date_keyword, user)". Group it by key, then map again: de-duplicate the users who searched each keyword on each day and count them; that count is the keyword's daily UV. The result is "(date_keyword, uv)".

4. Map the daily per-keyword UV RDD to an RDD with element type Row, and convert it to a DataFrame.

5. Register the DataFrame as a temporary table and use a Spark SQL window function to pick the top three keywords by UV for each day, together with their UVs. The result is another DataFrame.

6. Convert the DataFrame back to an RDD and continue: group by date and map, computing the total UV of each day's top 3 keywords; then use that total as the key and splice the date with the day's top 3 keywords and their UV counts into a single string.

7. Sort by each day's top 3 total UV in descending order.

8. Map the sorted data back into the "(date, keyword, uv)" format.

9. Convert to a DataFrame again and save the result to Hive.
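
For reference, here is a hypothetical sample of keyword.txt in the colon-separated format the code below expects (date:keyword:user:city:platform:version); under the beijing/android/1.0 query conditions, only matching lines survive the filter:

2019-03-18:hadoop:tom:beijing:android:1.0
2019-03-18:hadoop:jack:beijing:android:1.0
2019-03-18:hadoop:lucy:beijing:android:1.0
2019-03-18:spark:tom:beijing:android:1.0
2019-03-18:spark:jack:beijing:android:1.0
2019-03-18:hive:tom:beijing:android:1.0
2019-03-19:spark:rose:shanghai:iphone:2.0    (filtered out)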

package Spark_SQL.Hive_sql;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
import java.util.*;

/**
 * @Date: 2019/3/18 13:20
 * @Author Angle
 */

/*
* Daily top3 Hot Search Words Case Statistics
*
* */

public class DailyTop3Keyword {

    public static void main(String[] args){
        SparkConf conf = new SparkConf().setMaster("local").setAppName("Top3Keyword");
        JavaSparkContext sc = new JavaSparkContext(conf);
//        SQLContext sqlContext = new SQLContext(sc);
        final HiveContext sqlContext = new HiveContext(sc.sc());


        //"Falsified Data" as map Query Conditions
        // In practical enterprise project development, this query condition is likely to be inserted into a MySQL table through J2EE platform.
        // Usually Spring and ORM frameworks are used to extract query conditions from MySQL tables.
        Map<String, List<String>> queryParamMap = new HashMap<String, List<String>>();
        //Arrays.asList() wraps each String in a List
        queryParamMap.put("city", Arrays.asList("beijing"));
        queryParamMap.put("platform",Arrays.asList("android"));
        queryParamMap.put("version",Arrays.asList("1.0"));

        //Encapsulate the query-parameter Map as a Broadcast variable,
        //an optimization -> one copy per worker instead of one per task
        final Broadcast<Map<String,List<String>>> queryParaMapBroadcast = sc.broadcast(queryParamMap);

        //Read the log file from HDFS into an RDD (a local file can be read instead)
        final JavaRDD<String> rowRDD = sc.textFile("hdfs://master:9000/sparkroot/keyword.txt");
//        final JavaRDD<String> rowRDD = sc.textFile("E:\\IDEA\\textFile\\hive\\keyword.txt");

        //Filter with the broadcast query-condition Map (drop records that do not match the specified query conditions)
        JavaRDD<String> filterRDD = rowRDD.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String log) throws Exception {
                String[] rowLog = log.split(":");

                String city = rowLog[3];
                String platform = rowLog[4];
                String version = rowLog[5];

                //Compare against the query (filter) conditions:
                //whenever a condition is set and the log record does not satisfy it, return false to filter the record out; return true when everything matches
                Map<String, List<String>> queryParaMapValue = queryParaMapBroadcast.value();

                //Check each filter condition in turn
                //get() looks up the value in the HashMap, so the key must be a quoted string
                List<String> cities = queryParaMapValue.get("city");
                if(cities.size()>0 && !cities.contains(city)){
                    return false;
                }


                //Filter out platforms that are not android
                List<String> platforms = queryParaMapValue.get("platform");
                if (platforms.size()>0 && !platforms.contains(platform)){
                    return false;
                }

                //Filter out versions that are not 1.0
                List<String> versions = queryParaMapValue.get("version");
                if (versions.size()>0 && !versions.contains(version)){
                    return false;
                }

                return true;
            }
        });

        //Map the filtered data into (date_keyword, user) format (convenient for the user de-duplication below)
        JavaPairRDD<String,String> dateKeywordUserRDD = filterRDD.mapToPair(new PairFunction<String, String, String>() {
            @Override
            public Tuple2<String, String> call(String log) throws Exception {
                String[] logSplit = log.split(":");
                String date = logSplit[0];
                String keyword = logSplit[1];
                String user = logSplit[2];

                //Splice the fields together
                return new Tuple2<String, String>(date + "_" + keyword,user);
            }
        });

        //Group by key -> which users searched each keyword on each day (duplicates not yet removed)
        JavaPairRDD<String, Iterable<String>> dateKeywordUsersRDD = dateKeywordUserRDD.groupByKey();

        //De-duplicate the users of each keyword per day to get its UV
        JavaPairRDD<String,Long> dateKeywordUvRDD = dateKeywordUsersRDD.mapToPair(
                new PairFunction<Tuple2<String, Iterable<String>>, String, Long>() {
            @Override
            public Tuple2<String, Long> call(Tuple2<String, Iterable<String>> tuple) throws Exception {

                String dateKeyword = tuple._1;
                Iterator<String> users = tuple._2.iterator();

                //De-duplicate and count the distinct users
                List<String> distinctUsers = new ArrayList<String>();

                while (users.hasNext()){
                    String user = users.next();
                    //Only add the user if not seen yet
                    if(!distinctUsers.contains(user)){
                        distinctUsers.add(user);
                    }
                }

                //Get the UV
                long uv = distinctUsers.size();
                //Return the uv obtained after de-duplication
                return new Tuple2<String, Long>(dateKeyword,uv);
            }
        });
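
        //An equivalent de-duplication sketch using a HashSet, whose add/contains are O(1)
        //rather than the O(n) List.contains() scan above (an alternative, not the original approach):
//        JavaPairRDD<String,Long> dateKeywordUvRDD = dateKeywordUsersRDD.mapToPair(
//                new PairFunction<Tuple2<String, Iterable<String>>, String, Long>() {
//            @Override
//            public Tuple2<String, Long> call(Tuple2<String, Iterable<String>> tuple) throws Exception {
//                Set<String> distinctUsers = new HashSet<String>();
//                for (String user : tuple._2) {
//                    distinctUsers.add(user);
//                }
//                return new Tuple2<String, Long>(tuple._1, (long) distinctUsers.size());
//            }
//        });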

        //Convert the uv data to a DataFrame (so it can be registered as a temporary table for SQL)
        //First map the RDD elements into Rows
        JavaRDD<Row> dateKeywordUvRowRDD = dateKeywordUvRDD.map(new Function<Tuple2<String, Long>, Row>() {
            @Override
            public Row call(Tuple2<String, Long> dateKeywordUv) throws Exception {
                String date = dateKeywordUv._1.split("_")[0];
                String keyword = dateKeywordUv._1.split("_")[1];
                Long uv = dateKeywordUv._2;

                return RowFactory.create(date,keyword,uv);
            }
        });

        //Construct the schema metadata
        List<StructField> structFields = Arrays.asList(
                DataTypes.createStructField("date",DataTypes.StringType,true),
                DataTypes.createStructField("keyword",DataTypes.StringType,true),
                DataTypes.createStructField("uv",DataTypes.LongType,true));
        StructType structType = DataTypes.createStructType(structFields);

        //A second way to build the same schema
//        List<StructField> structField = new ArrayList<StructField>();
//        structField.add(DataTypes.createStructField("date",DataTypes.StringType,true));
//        structField.add(DataTypes.createStructField("keyword",DataTypes.StringType,true));
//        structField.add(DataTypes.createStructField("uv",DataTypes.LongType,true));
//        StructType structType = DataTypes.createStructType(structField);

        Dataset<Row> dateKeywordUvDF = sqlContext.createDataFrame(dateKeywordUvRowRDD, structType);

        //Use a Spark SQL window function to pick each day's top three keywords by UV
        dateKeywordUvDF.registerTempTable("keyword_uv");

        Dataset<Row> dateKeywordTop3DF = sqlContext.sql("" +
                "select date,keyword,uv " +
                "from (" +
                    "select date,keyword,uv,row_number() over (partition by date order by uv desc) rank " +
                    "from keyword_uv" +
                ") tmp " +
                "where rank<=3");


        //DataFrame -> RDD (mapToPair), then group by date and map to compute each day's total top3 uv
        JavaRDD<Row> dateKeywordTop3RDD = dateKeywordTop3DF.javaRDD();

        JavaPairRDD<String,String> top3DateKeywordUvRDD = dateKeywordTop3RDD.mapToPair(new PairFunction<Row, String, String>() {
            @Override
            public Tuple2<String, String> call(Row row) throws Exception {
                String date = String.valueOf(row.get(0));
                String keyword = String.valueOf(row.get(1));
                Long uv = Long.valueOf(String.valueOf(row.get(2)));

                return new Tuple2<String,String>(date,keyword + "_" + uv);
            }
        });

        JavaPairRDD<String, Iterable<String>> top3DateKeywordsUvRDD = top3DateKeywordUvRDD.groupByKey();

        //Total the uv per day and splice the date with its keyword_uv pairs into one string
        JavaPairRDD<Long,String> uvDateKeyWordRDD =  top3DateKeywordsUvRDD.mapToPair(new PairFunction<Tuple2<String, Iterable<String>>, Long, String>() {
            @Override
            public Tuple2<Long, String> call(Tuple2<String, Iterable<String>> tuple) throws Exception {

                String date = tuple._1;
                Long total = 0L;
                String dateKeywords = date;

                Iterator<String> keywordUvIterator = tuple._2.iterator();

                while (keywordUvIterator.hasNext()){
                    String keywordUv = keywordUvIterator.next();
                    Long uv = Long.valueOf(keywordUv.split("_")[1]);
                    total += uv;

//                    dateKeywords += "," + keywordUv.split("_")[0] ;
                    dateKeywords += "," + keywordUv;
                }

                return new Tuple2<Long, String>(total,dateKeywords);
            }
        });
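
        //With the hypothetical sample data shown before the code, an element of uvDateKeyWordRDD
        //might look like (6, "2019-03-18,hadoop_3,spark_2,hive_1"):
        //the day's total uv as key, the date spliced with its keyword_uv pairs as value.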

        //Sort by each day's total top3 uv in descending order (easy to pick the hottest days first)
        JavaPairRDD<Long,String> sortUvDateKeywordRDD = uvDateKeyWordRDD.sortByKey(false);

        //Map again: flatten the sorted data back into Rows of (date, keyword, uv)
        JavaRDD<Row> sortRowRDD = sortUvDateKeywordRDD.flatMap(new FlatMapFunction<Tuple2<Long, String>, Row>() {
            @Override
            public Iterator<Row> call(Tuple2<Long, String> tuple) throws Exception {
                String dateKeywords = tuple._2;
                String[] dateKeywordsSplit = dateKeywords.split(",");

                String date = dateKeywordsSplit[0];

                List<Row> rows = new ArrayList<Row>();
                //Split the keyword_uv strings spliced together above to recover each top3 keyword and its uv
                //(a loop also copes with days that have fewer than three keywords)
                for (int i = 1; i < dateKeywordsSplit.length; i++) {
                    String[] keywordUv = dateKeywordsSplit[i].split("_");
                    rows.add(RowFactory.create(date, keywordUv[0], Long.valueOf(keywordUv[1])));
                }

                return rows.iterator();
            }
        });

        //Finally convert back to a DataFrame and save to Hive
        Dataset<Row> finalDF = sqlContext.createDataFrame(sortRowRDD,structType);
        finalDF.write().saveAsTable("keyword_uv_top3");
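
        //Quick sanity check of the saved table (a sketch; uncomment to use):
//        sqlContext.sql("select * from keyword_uv_top3").show();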

        //Local-run alternative: save as a text file instead
//        Dataset<Row> finalDF = sqlContext.createDataFrame(sortRowRDD,structType);
//        finalDF.rdd().saveAsTextFile("E:\\IDEA\\textFile\\hive\\keywordUV.txt");


        sc.close();
    }
}

 
