The windowing function of Spark

Keywords: Python Spark SQL Apache hive

I. Introduction

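The window function row_number() numbers the rows inside each group (partition) of one field according to the sort order of another field; keeping only the rows whose number is at most N is therefore equivalent to a grouped top-N query. In Spark 1.x, a SQL statement that uses a window function must be executed with HiveContext; with the SparkSession API (Spark 2.x) the same statement also runs without Hive support, as the second example below shows.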

II. Code practice [use HiveContext]

package big.data.analyse.sparksql

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

/**
  * Grouped top-N demo built on the `row_number()` window function, executed
  * through a SparkSession that has Hive support enabled.
  *
  * Created by zhen on 2019/7/6.
  */
object RowNumber {
  /**
    * Set log level: loggers under "org" only report WARN and above
    */
  Logger.getLogger("org").setLevel(Level.WARN)

  /**
    * Builds the top-N query over the "technology" view. Rows are numbered per
    * `id` by descending `age`, and `predicate` is applied to that row number.
    *
    * @param predicate comparison applied to the row number, e.g. "<= 2"
    * @return the complete SQL statement
    */
  private def topQuery(predicate: String): String =
    "select id,name,age from (select id,name,age," +
      "row_number() over (partition by id order by age desc) top from technology) t where t.top " +
      predicate

  /**
    * Entry point: registers a small test table and prints the result of the
    * row_number() query for several row-number predicates.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    /**
      * Create spark entry point, with Hive support
      */
    val spark = SparkSession.builder().appName("RowNumber")
      .master("local[2]").enableHiveSupport().getOrCreate()

    // Stop the session even when one of the steps below throws.
    try {
      /**
        * Create test data: "id,name,age" records
        */
      val array = Array("1,Hadoop,12","5,Spark,6","3,Solr,15","3,HBase,8","6,Hive,16","6,TensorFlow,26")

      val rdd = spark.sparkContext.parallelize(array).map { row =>
        val Array(id, name, age) = row.split(",")
        Row(id, name, age.toInt)
      }

      val structType = new StructType(Array(
        StructField("id", StringType, true),
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
      ))

      /**
        * Converted to df
        */
      val df = spark.createDataFrame(rdd, structType)
      df.show()

      df.createOrReplaceTempView("technology")

      /**
        * Apply window function row_number
        * NOTE(review): the HiveContext requirement for window functions applies
        * to Spark 1.x; with SparkSession they are expected to work without Hive
        * support as well - confirm against the Spark version in use.
        */
      // Same query each time; only the filter on the row number changes.
      // With this test data no id has more than 2 rows, so "> 3" prints an empty table.
      Seq("<= 1", "<= 2", "<= 3", "> 3").foreach { predicate =>
        spark.sql(topQuery(predicate)).show()
      }
    } finally {
      spark.stop()
    }
  }
}

III. Results [use HiveContext]

1. Initial data

    

2. When top <= 1

    

3. When top <= 2

    

4. When top <= 3

    

5. When top > 3: the result is empty, because no group in the test data contains more than 3 rows.

    

IV. Code practice [do not use HiveContext]

package big.data.analyse.sparksql

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

/**
  * Grouped top-N demo built on the `row_number()` window function, executed
  * through a plain SparkSession (Hive support is NOT enabled).
  *
  * Created by zhen on 2019/7/6.
  */
object RowNumber {
  /**
    * Set log level: loggers under "org" only report WARN and above
    */
  Logger.getLogger("org").setLevel(Level.WARN)

  /** Row-number filters to demonstrate, in the order their results are printed. */
  private val TopPredicates: Seq[String] = Seq("<= 1", "<= 2", "<= 3", "> 3")

  /**
    * Builds the SQL that numbers the rows of the "technology" view per `id`
    * (highest `age` first) and keeps the rows whose number satisfies `predicate`.
    *
    * @param predicate comparison applied to the row number, e.g. "<= 2"
    * @return the complete SQL statement
    */
  private def rankedSql(predicate: String): String = {
    val ranked = "select id,name,age," +
      "row_number() over (partition by id order by age desc) top from technology"
    s"select id,name,age from ($ranked) t where t.top $predicate"
  }

  /**
    * Entry point: registers a small test table and prints the result of the
    * row_number() query for each predicate in `TopPredicates`.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    /**
      * Create spark entry, Hive not supported
      */
    val spark = SparkSession.builder().appName("RowNumber")
      .master("local[2]").getOrCreate()

    // Stop the session even when one of the steps below throws.
    try {
      /**
        * Create test data: "id,name,age" records
        */
      val array = Array("1,Hadoop,12","5,Spark,6","3,Solr,15","3,HBase,8","6,Hive,16","6,TensorFlow,26")

      val rdd = spark.sparkContext.parallelize(array).map { row =>
        val Array(id, name, age) = row.split(",")
        Row(id, name, age.toInt)
      }

      val structType = new StructType(Array(
        StructField("id", StringType, true),
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
      ))

      /**
        * Converted to df
        */
      val df = spark.createDataFrame(rdd, structType)
      df.show()

      df.createOrReplaceTempView("technology")

      /**
        * Apply window function row_number
        * Note: this session has no Hive support, so the queries below run on
        * Spark SQL's own window-function support rather than on HiveContext.
        */
      // With this test data no id has more than 2 rows, so "> 3" prints an empty table.
      for (predicate <- TopPredicates) {
        spark.sql(rankedSql(predicate)).show()
      }
    } finally {
      spark.stop()
    }
  }
}

V. Results [do not use HiveContext]

  

Posted by nezbo on Fri, 01 Nov 2019 16:08:24 -0700