About this Block
BigQuery allows for the creation of User Defined SQL function written in either SQL or Javascript. This block shows how to create and use a MEDIAN function written in SQL.
How it works:
LookML explores have a parameter sql_preamble: that allow you to specify sql statements that execute before the query is run. BigQuery has a CREATE TEMP FUNCTION statement that lets you create user defined functions. We have written a function computes median and added it to the explore.
Every query that runs from this explore will have the median function inserted before the SQL code for the query.
explore: sf_salary {
sql_preamble:
CREATE TEMP FUNCTION MEDIAN(a_num ARRAY<FLOAT64>)
RETURNS FLOAT64 AS ((
SELECT
AVG(num)
FROM (
SELECT
row_number() OVER (ORDER BY num) -1 as rn
, num
FROM UNNEST(a_num) num
)
WHERE
rn = TRUNC(ARRAY_LENGTH(a_num)/2)
OR (
MOD(ARRAY_LENGTH(a_num), 2) = 0 AND
rn = TRUNC(ARRAY_LENGTH(a_num)/2)-1 )
));
;;
}
Calling the Function
Functions in BigQuery can take Array of any type (read about arrays in BigQuery here). We first collect all the numeric values into an array, useing the ARRAY_AGG() and then pass the array to the MEDIAN() user defined function which scans the array and looks for the median value.
measure: average_base_pay {
type: average
sql: ${pay} ;;
value_format_name: "decimal_0"
}
measure: median_base_pay {
type: number
sql: MEDIAN(ARRAY_AGG(${pay})) ;;
value_format_name: "decimal_0"
}
The LookML Code
connection: "bigquery_publicdata_standard_sql"
explore: sf_salary {
sql_preamble:
CREATE TEMP FUNCTION MEDIAN(a_num ARRAY<FLOAT64>)
RETURNS FLOAT64 AS ((
SELECT
AVG(num)
FROM (
SELECT
row_number() OVER (ORDER BY num) -1 as rn
, num
FROM UNNEST(a_num) num
)
WHERE
rn = TRUNC(ARRAY_LENGTH(a_num)/2)
OR (
MOD(ARRAY_LENGTH(a_num), 2) = 0 AND
rn = TRUNC(ARRAY_LENGTH(a_num)/2)-1 )
));
;;
}
view: sf_salary {
sql_table_name: `lookerdata.sfsalary.salaries` ;;
measure: count {
type: count
drill_fields: [detail*]
}
dimension: id {
type: number
sql: ${TABLE}.Id ;;
}
dimension: employee_name {
type: string
sql: ${TABLE}.EmployeeName ;;
}
dimension: job_title {
type: string
sql: ${TABLE}.JobTitle ;;
}
# Pay is stored as a string, safely convert it to a float.
dimension: pay {
type: string
sql: CASE WHEN
REGEXP_CONTAINS(${TABLE}.TotalPay, r'^[\d\.]+$')
THEN CAST(${TABLE}.TotalPay AS FLOAT64)
END ;;
}
measure: average_base_pay {
type: average
sql: ${pay} ;;
value_format_name: "decimal_0"
}
measure: median_base_pay {
type: number
sql: MEDIAN(ARRAY_AGG(${pay})) ;;
value_format_name: "decimal_0"
}
dimension: year {
type: string
sql: ${TABLE}.Year ;;
}
dimension: status {
type: string
sql: ${TABLE}.Status ;;
}
set: detail {
fields: [
id,
employee_name,
job_title,
pay,
year,
status
]
}
}
The SQL for the Query
CREATE TEMP FUNCTION MEDIAN(a_num ARRAY<FLOAT64>)
RETURNS FLOAT64 AS ((
SELECT
AVG(num)
FROM (
SELECT
row_number() OVER (ORDER BY num) -1 as rn
, num
FROM UNNEST(a_num) num
)
WHERE
rn = TRUNC(ARRAY_LENGTH(a_num)/2)
OR (
MOD(ARRAY_LENGTH(a_num), 2) = 0 AND
rn = TRUNC(ARRAY_LENGTH(a_num)/2)-1 )
));
SELECT
sf_salary.JobTitle AS sf_salary_job_title,
COUNT(*) AS sf_salary_count,
MEDIAN(ARRAY_AGG((CASE WHEN
REGEXP_CONTAINS(sf_salary.TotalPay, r'^[\d\.]+$')
THEN CAST(sf_salary.TotalPay AS FLOAT64)
END))) AS sf_salary_median_base_pay,
AVG((CASE WHEN
REGEXP_CONTAINS(sf_salary.TotalPay, r'^[\d\.]+$')
THEN CAST(sf_salary.TotalPay AS FLOAT64)
END) ) AS sf_salary_average_base_pay
FROM `lookerdata.sfsalary.salaries` AS sf_salary
GROUP BY 1
ORDER BY 2 DESC
LIMIT 500