[Analytic Block] Building a TopN measure in BigQuery

  • 26 January 2017
  • 0 replies
  • 391 views

Userlevel 6
Badge

Building a TopN measure in BigQuery


BigQuery’s User Defined Functions are really powerful. In this block we’re going to build a new reusable TOPN measure that can help us see all kinds of things in data.


The Names Data


We’ll be working with a names database that contains Name, gender, year, state and the number of people with those attributes that were born in with that name, gender in that state, year.


The measure


The measure declared in the LookML model is declared below. The pairs_sum_top_n takes an array of keys and a values. In this case, the key being name and the value being the number of people born.


  measure: top_5_names {
type: string
sql: pairs_sum_top_n(ARRAY_AGG(STRUCT(${name} as key, ${number} as value)), 5) ;;
}

First, what is it for.


Like any measure, the TopN can show you something about a set of data. Let’s look at the most basic query. We can see that the total population of the data set and the top 5 names. James is 1.67% of the total population.



Gender


We can group by any dimension, so we can easily see this by gender. Adding Gender to the query below. We see that Mary is the most popular female name being 2.62% of the population.



Decade Pivoting Gender


The measure really gets powerful when we increase the dimensionalization. We’re going to add decade and pivot out Gender



Filtering


Of course, like any measure, you can filter the data. Let’s look at names by Decade and Gender that start with the letter ‘J’



Filtered Pivoting


We can also filter by state and compare names across several different states and time. Here are female names by decade and state.



The Code and how it works.


The data model that generated the above data is in the code below. The LookML part with the exception of the measure declartion is pretty simple. The entire code is listed here for reference.


connection: "bigquery_publicdata_standard_sql"

persist_for: "24 hours"

explore: custom_functions {
extension: required
sql_preamble:

-- take a dimension, number pair and aggregate as a sum
CREATE TEMP FUNCTION pairs_sum(a ARRAY<STRUCT<key STRING, value FLOAT64>>)
RETURNS ARRAY<STRUCT<key STRING, value FLOAT64>> AS ((
SELECT
ARRAY_AGG(STRUCT(key,total_value as value))
FROM (
SELECT
key
, SUM(value) as total_value
FROM UNNEST(a)
GROUP BY 1
ORDER BY 2 DESC
)
));

-- convert a array to a shortened array with an 'Other'. Keep the ordering by Num and make other last
-- by using a row number.
CREATE TEMP FUNCTION pairs_top_n(a ARRAY<STRUCT<key STRING, value FLOAT64>>, n INT64, use_other BOOL)
RETURNS ARRAY<STRUCT<key STRING, value FLOAT64>> AS ((
SELECT
ARRAY(
SELECT
STRUCT(key2 as key ,value2 as value)
FROM (
SELECT
CASE WHEN rn <= n THEN key ELSE 'Other' END as key2
, CASE WHEN rn <= n THEN n ELSE n + 1 END as n2
, SUM(value) as value2
FROM (
SELECT
ROW_NUMBER() OVER() as rn
, *
FROM UNNEST(a)
ORDER BY value DESC
)
GROUP BY 1,2
ORDER BY 2
) as t
WHERE key2 <> 'Other' or use_other
ORDER BY n2
)
));


-- take a set of string, number pairs and convert the number to percentage of max or total
-- pass 'total' or 'max' as type to change behaviour
CREATE TEMP FUNCTION pairs_convert_percentage(a ARRAY<STRUCT<key STRING, value FLOAT64>>,type STRING)
RETURNS ARRAY<STRUCT<key STRING, value FLOAT64>> AS ((
SELECT
ARRAY_AGG(STRUCT(key,new_value as value))
FROM (
SELECT
key
, 100.0*value/total
as new_value
FROM UNNEST(a)
CROSS JOIN (
SELECT
CASE
WHEN type='total' THEN SUM(b.value)
WHEN type='max' THEN MAX(b.value)
END
as total FROM UNNEST(a) as b
) as t
ORDER BY 2 DESC
)
));

-- formats a STR N into String(number)
CREATE TEMP FUNCTION format_result(key STRING, value FLOAT64, format_str STRING)
RETURNS STRING AS ((
SELECT
CONCAT(key, '(',
CASE
WHEN format_str = 'decimal_0'
THEN FORMAT("%0.0f", value)
WHEN format_str = 'percent_0'
THEN FORMAT("%0.2f%%", value)
END,
')' )
));

-- convert pairs into a string ('Other' is always last)
CREATE TEMP FUNCTION pairs_to_string(a ARRAY<STRUCT<key STRING, value FLOAT64>>, format_str STRING)
RETURNS STRING AS ((
SELECT
STRING_AGG(value2,", ")
FROM (
SELECT (
format_result(key,value,format_str)) as value2
,rn
FROM (
SELECT
ROW_NUMBER() OVER (ORDER BY CASE WHEN key='Other' THEN -1 ELSE value END DESC) as rn
, *
FROM
UNNEST(a)
)
ORDER BY rn
)
));

-- take pairs sum, topn then and convert to a string
CREATE TEMP FUNCTION pairs_sum_top_n(a ARRAY<STRUCT<key STRING, value FLOAT64>>, n INT64)
RETURNS STRING AS ((
pairs_to_string( pairs_top_n(pairs_convert_percentage(pairs_sum(a),'total'), n, true), 'percent_0' )
));

;;
}

connection: "bigquery_publicdata_standard_sql"
include: "*.view.lkml"
explore: names {
hidden: yes
extends: [custom_functions]
persist_for: "24 hours"
}

view: names {
sql_table_name: `fh-bigquery.popular_names.usa_1910_2013`
;;

measure: top_5_names {
type: string
sql: pairs_sum_top_n(ARRAY_AGG(STRUCT(${name} as key, ${number} as value)), 5) ;;
}

dimension: state {
type: string
sql: ${TABLE}.state ;;
}

dimension: gender {
type: string
sql: ${TABLE}.gender ;;
}

dimension: year {
type: number
}

dimension: decade {
type: number
sql: CAST(FLOOR(${year}/10)*10 AS INT64) ;;
}

dimension: name {
type: string
sql: ${TABLE}.name ;;
}

dimension: first_letter {
sql: SUBSTR(${name},1,1) ;;
}

dimension: number {
type: number
sql: CAST(${TABLE}.number AS FLOAT64) ;;
}

measure: total_number {
type: sum
sql: ${number} ;;
}

set: detail {
fields: [state, gender, year, name, number]
}
}

The Measure


Key value pairs (name, population) are collected into an array of structs using BigQuery’s ARRAY_AGG function. The array is passed to the to the pairs_sum_top_n function.


  measure: top_5_names {
type: string
sql: pairs_sum_top_n(ARRAY_AGG(STRUCT(${name} as key, ${number} as value)), 5) ;;
}


How it works


This function performs 4 steps. All of the ‘pairs’ functions operate on arrays of (key,value) pairs.




  • pairs_sum - Aggregate the data from raw key values (where the keys can be repeated into key, sum(value)


  • pairs_convert_percentage - convert the sum(value) into a percent of total


  • pairs_top_n - take the (key, percentage) array and convert into an array of length N with an ‘other’ summing the rest.


  • pairs_convert_string - take the key value pairs and convert it to a string.


     -- take pairs sum, topn then and convert to a string
CREATE TEMP FUNCTION pairs_sum_top_n(a ARRAY<STRUCT<key STRING, value FLOAT64>>, n INT64)
RETURNS STRING AS ((
pairs_to_string( pairs_top_n(pairs_convert_percentage(pairs_sum(a),'total'), n, true), 'percent_0' )
));

Just the SQL


Below is the Query Looker wrote for TopN names by Decade.


-- take a dimension, number pair and aggregate as a sum
CREATE TEMP FUNCTION pairs_sum(a ARRAY<STRUCT<key STRING, value FLOAT64>>)
RETURNS ARRAY<STRUCT<key STRING, value FLOAT64>> AS ((
SELECT
ARRAY_AGG(STRUCT(key,total_value as value))
FROM (
SELECT
key
, SUM(value) as total_value
FROM UNNEST(a)
GROUP BY 1
ORDER BY 2 DESC
)
));

-- convert a array to a shortened array with an 'Other'. Keep the ordering by Num and make other last
-- by using a row number.
CREATE TEMP FUNCTION pairs_top_n(a ARRAY<STRUCT<key STRING, value FLOAT64>>, n INT64, use_other BOOL)
RETURNS ARRAY<STRUCT<key STRING, value FLOAT64>> AS ((
SELECT
ARRAY(
SELECT
STRUCT(key2 as key ,value2 as value)
FROM (
SELECT
CASE WHEN rn <= n THEN key ELSE 'Other' END as key2
, CASE WHEN rn <= n THEN n ELSE n + 1 END as n2
, SUM(value) as value2
FROM (
SELECT
ROW_NUMBER() OVER() as rn
, *
FROM UNNEST(a)
ORDER BY value DESC
)
GROUP BY 1,2
ORDER BY 2
) as t
WHERE key2 <> 'Other' or use_other
ORDER BY n2
)
));


-- take a set of string, number pairs and convert the number to percentage of max or total
-- pass 'total' or 'max' as type to change behaviour
CREATE TEMP FUNCTION pairs_convert_percentage(a ARRAY<STRUCT<key STRING, value FLOAT64>>,type STRING)
RETURNS ARRAY<STRUCT<key STRING, value FLOAT64>> AS ((
SELECT
ARRAY_AGG(STRUCT(key,new_value as value))
FROM (
SELECT
key
, 100.0*value/total
as new_value
FROM UNNEST(a)
CROSS JOIN (
SELECT
CASE
WHEN type='total' THEN SUM(b.value)
WHEN type='max' THEN MAX(b.value)
END
as total FROM UNNEST(a) as b
) as t
ORDER BY 2 DESC
)
));

-- formats a STR N into String(number)
CREATE TEMP FUNCTION format_result(key STRING, value FLOAT64, format_str STRING)
RETURNS STRING AS ((
SELECT
CONCAT(key, '(',
CASE
WHEN format_str = 'decimal_0'
THEN FORMAT("%0.0f", value)
WHEN format_str = 'percent_0'
THEN FORMAT("%0.2f%%", value)
END,
')' )
));

-- convert pairs into a string ('Other' is always last)
CREATE TEMP FUNCTION pairs_to_string(a ARRAY<STRUCT<key STRING, value FLOAT64>>, format_str STRING)
RETURNS STRING AS ((
SELECT
STRING_AGG(value2,", ")
FROM (
SELECT (
format_result(key,value,format_str)) as value2
,rn
FROM (
SELECT
ROW_NUMBER() OVER (ORDER BY CASE WHEN key='Other' THEN -1 ELSE value END DESC) as rn
, *
FROM
UNNEST(a)
)
ORDER BY rn
)
));

-- take pairs sum, topn then and convert to a string
CREATE TEMP FUNCTION pairs_sum_top_n(a ARRAY<STRUCT<key STRING, value FLOAT64>>, n INT64)
RETURNS STRING AS ((
pairs_to_string( pairs_top_n(pairs_convert_percentage(pairs_sum(a),'total'), n, true), 'percent_0' )
));


SELECT
CAST(FLOOR(names.year/10)*10 AS INT64) AS names_decade,
pairs_sum_top_n(ARRAY_AGG(STRUCT(names.name as key, (CAST(names.number AS FLOAT64)) as value)), 5) AS names_top_5_names
FROM `fh-bigquery.popular_names.usa_1910_2013`
AS names

GROUP BY 1
ORDER BY 2 DESC
LIMIT 500

0 replies

Be the first to reply!

Reply