The scope of Elasticsearch aggregations

Keywords: Programming Java Javascript

test data

index structure

# Create the employees index with explicit mappings.
# "job" is a text field with a "keyword" sub-field (job.keyword), which the terms aggregations in this article use.
PUT /employees/
{
  "mappings": {
    "properties": {
      "age": { "type": "integer" },
      "gender": { "type": "keyword" },
      "job": {
        "type": "text",
        "fields": {
          "keyword": { "type": "keyword", "ignore_above": 50 }
        }
      },
      "name": { "type": "keyword" },
      "salary": { "type": "integer" }
    }
  }
}

Insert 20 sample documents

# Bulk-index the 20 sample employee documents (ids 1 to 20) into the employees index.
# Each document takes two lines: an "index" action line followed by the document source.
PUT /employees/_bulk
{ "index": { "_id": "1" } }
{ "name": "Emma", "age": 32, "job": "Product Manager", "gender": "female", "salary": 35000 }
{ "index": { "_id": "2" } }
{ "name": "Underwood", "age": 41, "job": "Dev Manager", "gender": "male", "salary": 50000 }
{ "index": { "_id": "3" } }
{ "name": "Tran", "age": 25, "job": "Web Designer", "gender": "male", "salary": 18000 }
{ "index": { "_id": "4" } }
{ "name": "Rivera", "age": 26, "job": "Web Designer", "gender": "female", "salary": 22000 }
{ "index": { "_id": "5" } }
{ "name": "Rose", "age": 25, "job": "QA", "gender": "female", "salary": 18000 }
{ "index": { "_id": "6" } }
{ "name": "Lucy", "age": 31, "job": "QA", "gender": "female", "salary": 25000 }
{ "index": { "_id": "7" } }
{ "name": "Byrd", "age": 27, "job": "QA", "gender": "male", "salary": 20000 }
{ "index": { "_id": "8" } }
{ "name": "Foster", "age": 27, "job": "Java Programmer", "gender": "male", "salary": 20000 }
{ "index": { "_id": "9" } }
{ "name": "Gregory", "age": 32, "job": "Java Programmer", "gender": "male", "salary": 22000 }
{ "index": { "_id": "10" } }
{ "name": "Bryant", "age": 20, "job": "Java Programmer", "gender": "male", "salary": 9000 }
{ "index": { "_id": "11" } }
{ "name": "Jenny", "age": 36, "job": "Java Programmer", "gender": "female", "salary": 38000 }
{ "index": { "_id": "12" } }
{ "name": "Mcdonald", "age": 31, "job": "Java Programmer", "gender": "male", "salary": 32000 }
{ "index": { "_id": "13" } }
{ "name": "Jonthna", "age": 30, "job": "Java Programmer", "gender": "female", "salary": 30000 }
{ "index": { "_id": "14" } }
{ "name": "Marshall", "age": 32, "job": "Javascript Programmer", "gender": "male", "salary": 25000 }
{ "index": { "_id": "15" } }
{ "name": "King", "age": 33, "job": "Java Programmer", "gender": "male", "salary": 28000 }
{ "index": { "_id": "16" } }
{ "name": "Mccarthy", "age": 21, "job": "Javascript Programmer", "gender": "male", "salary": 16000 }
{ "index": { "_id": "17" } }
{ "name": "Goodwin", "age": 25, "job": "Javascript Programmer", "gender": "male", "salary": 16000 }
{ "index": { "_id": "18" } }
{ "name": "Catherine", "age": 29, "job": "Javascript Programmer", "gender": "female", "salary": 20000 }
{ "index": { "_id": "19" } }
{ "name": "Boone", "age": 30, "job": "DBA", "gender": "male", "salary": 30000 }
{ "index": { "_id": "20" } }
{ "name": "Kathy", "age": 29, "job": "DBA", "gender": "female", "salary": 20000 }

query

By default, an Elasticsearch aggregation operates on the result set of the query; that is, the aggregation is computed over the documents that the query matched.

# Match employees aged 30 or older, return the first 3 hits,
# and bucket the matched employees by job title.
POST /employees/_search
{
  "size": 3,
  "query": {
    "range": { "age": { "gte": 30 } }
  },
  "aggs": {
    "jobs": {
      "terms": { "field": "job.keyword" }
    }
  }
}

Return result

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 10,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "employees",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 1.0,
        "_source" : {
          "name" : "Emma",
          "age" : 32,
          "job" : "Product Manager",
          "gender" : "female",
          "salary" : 35000
        }
      },
      {
        "_index" : "employees",
        "_type" : "_doc",
        "_id" : "2",
        "_score" : 1.0,
        "_source" : {
          "name" : "Underwood",
          "age" : 41,
          "job" : "Dev Manager",
          "gender" : "male",
          "salary" : 50000
        }
      },
      {
        "_index" : "employees",
        "_type" : "_doc",
        "_id" : "6",
        "_score" : 1.0,
        "_source" : {
          "name" : "Lucy",
          "age" : 31,
          "job" : "QA",
          "gender" : "female",
          "salary" : 25000
        }
      }
    ]
  },
  "aggregations" : {
    "jobs" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "Java Programmer",
          "doc_count" : 5
        },
        {
          "key" : "DBA",
          "doc_count" : 1
        },
        {
          "key" : "Dev Manager",
          "doc_count" : 1
        },
        {
          "key" : "Javascript Programmer",
          "doc_count" : 1
        },
        {
          "key" : "Product Manager",
          "doc_count" : 1
        },
        {
          "key" : "QA",
          "doc_count" : 1
        }
      ]
    }
  }
}

filter

If we want to filter only the data being aggregated without affecting the query results, or to filter inside one aggregation without affecting the other aggregations, we can use a filter aggregation.

# The query selects employees aged 30 or older.
# "older_person" narrows only its own sub-aggregation to employees aged 35 or older,
# while "all_jobs" buckets every document matched by the query.
POST /employees/_search
{
  "size": 3,
  "query": {
    "range": {
      "age": {
        "gte": 30
      }
    }
  },
  "aggs": {
    "older_person": {
      "filter": {
        "range": {
          "age": {
            "gte": 35
          }
        }
      },
      "aggs": {
        "jobs": {
          "terms": {
            "field": "job.keyword"
          }
        }
      }
    },
    "all_jobs": {
      "terms": {
        "field": "job.keyword"
      }
    }
  }
}

The results are as follows:

{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 10,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "employees",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 1.0,
        "_source" : {
          "name" : "Emma",
          "age" : 32,
          "job" : "Product Manager",
          "gender" : "female",
          "salary" : 35000
        }
      },
      {
        "_index" : "employees",
        "_type" : "_doc",
        "_id" : "2",
        "_score" : 1.0,
        "_source" : {
          "name" : "Underwood",
          "age" : 41,
          "job" : "Dev Manager",
          "gender" : "male",
          "salary" : 50000
        }
      },
      {
        "_index" : "employees",
        "_type" : "_doc",
        "_id" : "6",
        "_score" : 1.0,
        "_source" : {
          "name" : "Lucy",
          "age" : 31,
          "job" : "QA",
          "gender" : "female",
          "salary" : 25000
        }
      }
    ]
  },
  "aggregations" : {
    "older_person" : {
      "doc_count" : 2,
      "jobs" : {
        "doc_count_error_upper_bound" : 0,
        "sum_other_doc_count" : 0,
        "buckets" : [
          {
            "key" : "Dev Manager",
            "doc_count" : 1
          },
          {
            "key" : "Java Programmer",
            "doc_count" : 1
          }
        ]
      }
    },
    "all_jobs" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "Java Programmer",
          "doc_count" : 5
        },
        {
          "key" : "DBA",
          "doc_count" : 1
        },
        {
          "key" : "Dev Manager",
          "doc_count" : 1
        },
        {
          "key" : "Javascript Programmer",
          "doc_count" : 1
        },
        {
          "key" : "Product Manager",
          "doc_count" : 1
        },
        {
          "key" : "QA",
          "doc_count" : 1
        }
      ]
    }
  }
}

We can see that the query hits are the same as in the previous example. The older_person aggregation additionally applies the condition in its filter (age 35 or older), so it only counts 2 employees, while all_jobs aggregates over all of the query hits, so its buckets are the same as in the previous example.

post_filter

What if you want a filter to apply only to the returned hits, and not to the aggregations? Use post_filter.

For example, suppose I want to select employees aged 23 or younger and group them by job type, and, separately, return only employees aged 35 or younger in the hits, limited to the first five.

# "young_person" buckets employees aged 23 or younger by job title.
# post_filter (age 35 or younger) is applied to the returned hits only, not to the aggregation.
POST employees/_search
{
  "size": 5,
  "aggs": {
    "young_person": {
      "filter": {
        "range": { "age": { "lte": 23 } }
      },
      "aggs": {
        "jobs": {
          "terms": { "field": "job.keyword" }
        }
      }
    }
  },
  "post_filter": {
    "range": { "age": { "lte": 35 } }
  }
}

The results are as follows:

{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 18,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "employees",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 1.0,
        "_source" : {
          "name" : "Emma",
          "age" : 32,
          "job" : "Product Manager",
          "gender" : "female",
          "salary" : 35000
        }
      },
      {
        "_index" : "employees",
        "_type" : "_doc",
        "_id" : "3",
        "_score" : 1.0,
        "_source" : {
          "name" : "Tran",
          "age" : 25,
          "job" : "Web Designer",
          "gender" : "male",
          "salary" : 18000
        }
      },
      {
        "_index" : "employees",
        "_type" : "_doc",
        "_id" : "4",
        "_score" : 1.0,
        "_source" : {
          "name" : "Rivera",
          "age" : 26,
          "job" : "Web Designer",
          "gender" : "female",
          "salary" : 22000
        }
      },
      {
        "_index" : "employees",
        "_type" : "_doc",
        "_id" : "5",
        "_score" : 1.0,
        "_source" : {
          "name" : "Rose",
          "age" : 25,
          "job" : "QA",
          "gender" : "female",
          "salary" : 18000
        }
      },
      {
        "_index" : "employees",
        "_type" : "_doc",
        "_id" : "6",
        "_score" : 1.0,
        "_source" : {
          "name" : "Lucy",
          "age" : 31,
          "job" : "QA",
          "gender" : "female",
          "salary" : 25000
        }
      }
    ]
  },
  "aggregations" : {
    "young_person" : {
      "doc_count" : 2,
      "jobs" : {
        "doc_count_error_upper_bound" : 0,
        "sum_other_doc_count" : 0,
        "buckets" : [
          {
            "key" : "Java Programmer",
            "doc_count" : 1
          },
          {
            "key" : "Javascript Programmer",
            "doc_count" : 1
          }
        ]
      }
    }
  }
}

In this example, the post_filter condition is not applied to the aggregation. You can think of post_filter and query as opposites in this respect: query narrows the documents before they are aggregated, whereas post_filter is applied to the hits after the aggregations have been computed, so post_filter and the aggregations are independent of each other.

global

The last one is global, which ignores the effect of the query. For example, we want to select employees aged 35 or older and group them by job type, and at the same time get the average salary of all employees.

# "jobs" buckets only the employees matched by the query (age 35 or older).
# "all" is a global aggregation, so "salary_avg" is computed without the query restriction.
POST /employees/_search
{
  "size": 0,
  "query": {
    "range": { "age": { "gte": 35 } }
  },
  "aggs": {
    "jobs": {
      "terms": { "field": "job.keyword" }
    },
    "all": {
      "global": {},
      "aggs": {
        "salary_avg": {
          "avg": { "field": "salary" }
        }
      }
    }
  }
}

The results are as follows:

{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "all" : {
      "doc_count" : 20,
      "salary_avg" : {
        "value" : 24700.0
      }
    },
    "jobs" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "Dev Manager",
          "doc_count" : 1
        },
        {
          "key" : "Java Programmer",
          "doc_count" : 1
        }
      ]
    }
  }
}

As we can see above, 24700.0 is the average salary of all employees, while jobs contains the job types of the employees aged 35 or older, namely Dev Manager and Java Programmer.

Of course, there are other ways to meet this requirement; the example above is only meant to demonstrate the use of global.

For example, the following request gives the same information:

# No query is given, so both aggregations start from all documents.
# "old_persons" filters to age 35 or older before bucketing by job title;
# "avg_salary" averages the salary field over every document.
POST /employees/_search
{
  "size": 0,
  "aggs": {
    "old_persons": {
      "filter": {
        "range": { "age": { "gte": 35 } }
      },
      "aggs": {
        "jobs": {
          "terms": { "field": "job.keyword" }
        }
      }
    },
    "avg_salary": {
      "avg": { "field": "salary" }
    }
  }
}

The results are as follows:

{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 20,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "old_persons" : {
      "doc_count" : 2,
      "jobs" : {
        "doc_count_error_upper_bound" : 0,
        "sum_other_doc_count" : 0,
        "buckets" : [
          {
            "key" : "Dev Manager",
            "doc_count" : 1
          },
          {
            "key" : "Java Programmer",
            "doc_count" : 1
          }
        ]
      }
    },
    "avg_salary" : {
      "value" : 24700.0
    }
  }
}

I came across this on someone else's blog and am reprinting it here.

Posted by Urbley on Mon, 02 Mar 2020 20:26:35 -0800