恰巧適逢自己轉職的需求,需要熟悉PostgreSQL資料庫系統,故本學習筆記僅記錄PostgreSQL的語法,並僅收錄「第三章:使用SQL整理數據」以及「第四章:營收狀態相關數據的彙總、分析」的說明範例,其餘後續章節探討的議題如使用者行為、網站指標、精確度、進階數據活用術等,則因其聚焦於某特定需求,加上鮮少使用新的語法,故決定不整理、不收錄至此筆記,俟未來有使用需求,才會再重新借閱、閱讀此書。
一、使用SQL整理數據
單一數值的整理
l 將編碼值轉換成標籤(CASE語句):
n 範例資料表:
DROP TABLE IF EXISTS mst_users;
CREATE TABLE mst_users(
user_id varchar(255)
, register_date varchar(255)
, register_device integer
);
INSERT INTO mst_users
VALUES
('U001', '2016-08-26', 1)
, ('U002', '2016-08-26', 2)
, ('U003', '2016-08-27', 3)
;
DROP TABLE IF EXISTS mst_users;
CREATE TABLE mst_users(
user_id varchar(255)
, register_date varchar(255)
, register_device integer
);
INSERT INTO mst_users
VALUES
('U001', '2016-08-26', 1)
, ('U002', '2016-08-26', 2)
, ('U003', '2016-08-27', 3)
;
n 將編碼值置換成標籤的Query:
SELECT
user_id
, CASE
WHEN register_device = 1 THEN 'PC'
WHEN register_device = 2 THEN 'SP'
WHEN register_device = 3 THEN 'APP'
-- 要指定預設值時使用ELSE句
-- ELSE ''
END AS device_name
FROM mst_users
;
SELECT
user_id
, CASE
WHEN register_device = 1 THEN 'PC'
WHEN register_device = 2 THEN 'SP'
WHEN register_device = 3 THEN 'APP'
-- 要指定預設值時使用ELSE句
-- ELSE ''
END AS device_name
FROM mst_users
;
l 從URL擷取元素(URL函數、正規表達式):
n 範例資料表:
DROP TABLE IF EXISTS access_log ;
CREATE TABLE access_log (
stamp varchar(255)
, referrer text
, url text
);
INSERT INTO access_log
VALUES
('2016-08-26 12:02:00', 'http://www.other.com/path1/index.php?k1=v1&k2=v2#Ref1', 'http://www.example.com/video/detail?id=001')
, ('2016-08-26 12:02:01', 'http://www.other.net/path1/index.php?k1=v1&k2=v2#Ref1', 'http://www.example.com/video#ref' )
, ('2016-08-26 12:02:01', 'https://www.other.com/' , 'http://www.example.com/book/detail?id=002' )
;
DROP TABLE IF EXISTS access_log ;
CREATE TABLE access_log (
stamp varchar(255)
, referrer text
, url text
);
INSERT INTO access_log
VALUES
('2016-08-26 12:02:00', 'http://www.other.com/path1/index.php?k1=v1&k2=v2#Ref1', 'http://www.example.com/video/detail?id=001')
, ('2016-08-26 12:02:01', 'http://www.other.net/path1/index.php?k1=v1&k2=v2#Ref1', 'http://www.example.com/video#ref' )
, ('2016-08-26 12:02:01', 'https://www.other.com/' , 'http://www.example.com/book/detail?id=002' )
;
n 從Referrer擷取網域(domain)的Query:
SELECT
stamp
-- 擷取referer的主機名稱
, substring(referrer from 'https?://([^/]*)') AS referrer_host
FROM access_log
;
SELECT
stamp
-- 擷取referer的主機名稱
, substring(referrer from 'https?://([^/]*)') AS referrer_host
FROM access_log
;
n 擷取URL的路徑和GET參數中的特定值:
SELECT
stamp
, url
-- 擷取URL的路徑和GET參數的id值
, substring(url from '//[^/]+([^?#]+)') AS path
, substring(url from 'id=([^&]*)') AS id
FROM access_log
;
SELECT
stamp
, url
-- 擷取URL的路徑和GET參數的id值
, substring(url from '//[^/]+([^?#]+)') AS path
, substring(url from 'id=([^&]*)') AS id
FROM access_log
;
l 將字串分割至陣列(split_part函數、split函數):
n 範例資料表:
DROP TABLE IF EXISTS access_log ;
CREATE TABLE access_log (
stamp varchar(255)
, referrer text
, url text
);
INSERT INTO access_log
VALUES
('2016-08-26 12:02:00', 'http://www.other.com/path1/index.php?k1=v1&k2=v2#Ref1', 'http://www.example.com/video/detail?id=001')
, ('2016-08-26 12:02:01', 'http://www.other.net/path1/index.php?k1=v1&k2=v2#Ref1', 'http://www.example.com/video#ref' )
, ('2016-08-26 12:02:01', 'https://www.other.com/' , 'http://www.example.com/book/detail?id=002' )
;
DROP TABLE IF EXISTS access_log ;
CREATE TABLE access_log (
stamp varchar(255)
, referrer text
, url text
);
INSERT INTO access_log
VALUES
('2016-08-26 12:02:00', 'http://www.other.com/path1/index.php?k1=v1&k2=v2#Ref1', 'http://www.example.com/video/detail?id=001')
, ('2016-08-26 12:02:01', 'http://www.other.net/path1/index.php?k1=v1&k2=v2#Ref1', 'http://www.example.com/video#ref' )
, ('2016-08-26 12:02:01', 'https://www.other.com/' , 'http://www.example.com/book/detail?id=002' )
;
n 用斜線從URL的path中擷取出階層資訊:
SELECT
stamp
, url
-- 利用斜線分割Path,依階層分別擷取出階層名稱
-- 由於Path一定由斜線開頭,所以第二個元素就會是起始的階層
, split_part(substring(url from '//[^/]+([^?#]+)'), '/', 2) AS path1
, split_part(substring(url from '//[^/]+([^?#]+)'), '/', 3) AS path2
FROM access_log
;
SELECT
stamp
, url
-- 利用斜線分割Path,依階層分別擷取出階層名稱
-- 由於Path一定由斜線開頭,所以第二個元素就會是起始的階層
, split_part(substring(url from '//[^/]+([^?#]+)'), '/', 2) AS path1
, split_part(substring(url from '//[^/]+([^?#]+)'), '/', 3) AS path2
FROM access_log
;
l 日期與時間戳記的處理(日期/時間函數、date型別、timestamp型別、字串函數):
n 取得現在日期與時間戳記的Query:
SELECT
-- 使用CURRENT_DATE常數和CURRENT_TIMESTAMP常數
CURRENT_DATE AS dt
, CURRENT_TIMESTAMP AS stamp
-- 希望取得不含時區的timestamp時, 請使用LOCALTIMESTAMP
-- , LOCALTIMESTAMP AS stamp
;
SELECT
-- 使用CURRENT_DATE常數和CURRENT_TIMESTAMP常數
CURRENT_DATE AS dt
, CURRENT_TIMESTAMP AS stamp
-- 希望取得不含時區的timestamp時, 請使用LOCALTIMESTAMP
-- , LOCALTIMESTAMP AS stamp
;
n 將字串轉換成日期型別、timestamp型別的Query:
SELECT
-- CAST(value AS type)的形式來轉換資料型別
CAST('2016-01-30' AS date) AS dt
, CAST('2016-01-30 12:00:00' AS timestamp) AS stamp
-- 可使用type value的形式來轉換資料型別
-- 但是value必須為常數,不能設定為欄位名稱
-- date '2016-01-30' AS dt
-- , timestamp '2016-01-30 12:00:00' AS stamp
-- 可使用value::type的形式來轉換資料型別
-- '2016-01-30'::date AS dt
-- , '2016-01-30 12:00:00'::timestamp AS stamp
;
SELECT
-- CAST(value AS type)的形式來轉換資料型別
CAST('2016-01-30' AS date) AS dt
, CAST('2016-01-30 12:00:00' AS timestamp) AS stamp
-- 可使用type value的形式來轉換資料型別
-- 但是value必須為常數,不能設定為欄位名稱
-- date '2016-01-30' AS dt
-- , timestamp '2016-01-30 12:00:00' AS stamp
-- 可使用value::type的形式來轉換資料型別
-- '2016-01-30'::date AS dt
-- , '2016-01-30 12:00:00'::timestamp AS stamp
;
n 從timestamp型別的資料中取得年、月、日...等特定單位的值:
SELECT
stamp
, EXTRACT(YEAR FROM stamp) AS year
, EXTRACT(MONTH FROM stamp) AS month
, EXTRACT(DAY FROM stamp) AS day
, EXTRACT(HOUR FROM stamp) AS hour
FROM
(SELECT CAST('2016-01-30 12:00:00' AS timestamp) AS stamp) AS t
;
SELECT
stamp
, EXTRACT(YEAR FROM stamp) AS year
, EXTRACT(MONTH FROM stamp) AS month
, EXTRACT(DAY FROM stamp) AS day
, EXTRACT(HOUR FROM stamp) AS hour
FROM
(SELECT CAST('2016-01-30 12:00:00' AS timestamp) AS stamp) AS t
;
n 從代表timestamp的字串中取得年、月、日...等值:
SELECT
stamp
-- 下列substring函數皆可改為使用substr函數
, substring(stamp, 1, 4) AS year
, substring(stamp, 6, 2) AS month
, substring(stamp, 9, 2) AS day
, substring(stamp, 12, 2) AS hour
-- 同時擷取年和月
, substring(stamp, 1, 7) AS year_month
FROM
(SELECT CAST('2016-01-30 12:00:00' AS text) AS stamp) AS t
;
SELECT
stamp
-- 下列substring函數皆可改為使用substr函數
, substring(stamp, 1, 4) AS year
, substring(stamp, 6, 2) AS month
, substring(stamp, 9, 2) AS day
, substring(stamp, 12, 2) AS hour
-- 同時擷取年和月
, substring(stamp, 1, 7) AS year_month
FROM
(SELECT CAST('2016-01-30 12:00:00' AS text) AS stamp) AS t
;
l 以預設值取代缺損的資料(COALESCE函數):
n 範例資料表:
DROP TABLE IF EXISTS purchase_log_with_coupon;
CREATE TABLE purchase_log_with_coupon (
purchase_id varchar(255)
, amount integer
, coupon integer
);
INSERT INTO purchase_log_with_coupon
VALUES
('10001', 3280, NULL)
, ('10002', 4650, 500)
, ('10003', 3870, NULL)
;
DROP TABLE IF EXISTS purchase_log_with_coupon;
CREATE TABLE purchase_log_with_coupon (
purchase_id varchar(255)
, amount integer
, coupon integer
);
INSERT INTO purchase_log_with_coupon
VALUES
('10001', 3280, NULL)
, ('10002', 4650, 500)
, ('10003', 3870, NULL)
;
n 計算折價後的銷售金額:
SELECT
purchase_id
, amount
, coupon
, amount - coupon AS discount_amount1
, amount - COALESCE(coupon, 0) AS discount_amount2
FROM purchase_log_with_coupon
;
SELECT
purchase_id
, amount
, coupon
, amount - coupon AS discount_amount1
, amount - COALESCE(coupon, 0) AS discount_amount2
FROM purchase_log_with_coupon
;
多個數值的整理
l 連接字串(CONCAT函數、||運算子):
n 範例資料表:
DROP TABLE IF EXISTS mst_user_location;
CREATE TABLE mst_user_location (
user_id varchar(255)
, pref_name varchar(255)
, city_name varchar(255)
);
INSERT INTO mst_user_location
VALUES
('U001', '東京都', '千代田區')
, ('U002', '東京都', '涉谷區' )
, ('U003', '千葉縣', '八千代市')
;
DROP TABLE IF EXISTS mst_user_location;
CREATE TABLE mst_user_location (
user_id varchar(255)
, pref_name varchar(255)
, city_name varchar(255)
);
INSERT INTO mst_user_location
VALUES
('U001', '東京都', '千代田區')
, ('U002', '東京都', '涉谷區' )
, ('U003', '千葉縣', '八千代市')
;
n 連接字串的Query:
SELECT
user_id
, CONCAT(pref_name, city_name) AS pref_city
-- 也可使用||運算子
-- , pref_name || city_name AS pref_city
FROM mst_user_location
;
SELECT
user_id
, CONCAT(pref_name, city_name) AS pref_city
-- 也可使用||運算子
-- , pref_name || city_name AS pref_city
FROM mst_user_location
;
l 比較多個數值(CASE語法、SIGN函數、greatest函數、least函數、四則運算子):
n 範例資料表:
DROP TABLE IF EXISTS quarterly_sales;
CREATE TABLE quarterly_sales (
year integer
, q1 integer
, q2 integer
, q3 integer
, q4 integer
);
INSERT INTO quarterly_sales
VALUES
(2015, 82000, 83000, 78000, 83000)
, (2016, 85000, 85000, 80000, 81000)
, (2017, 92000, 81000, NULL , NULL )
;
DROP TABLE IF EXISTS quarterly_sales;
CREATE TABLE quarterly_sales (
year integer
, q1 integer
, q2 integer
, q3 integer
, q4 integer
);
INSERT INTO quarterly_sales
VALUES
(2015, 82000, 83000, 78000, 83000)
, (2016, 85000, 85000, 80000, 81000)
, (2017, 92000, 81000, NULL , NULL )
;
n 比較Q1、Q2兩個欄位的Query:
SELECT
year
, q1
, q2
-- Q1和Q2的營收變化比較
, CASE
WHEN q1 < q2 THEN '+'
WHEN q1 = q2 THEN ' '
ELSE '-'
END AS judge_q1_q2
-- 計算Q1和Q2營收差
, q2 - q1 AS diff_q2_q1
-- 將Q1和Q2營收變化比較用1, 0, -1呈現
, SIGN(q2 - q1) AS sign_q2_q1
FROM quarterly_sales
ORDER BY year
;
SELECT
year
, q1
, q2
-- Q1和Q2的營收變化比較
, CASE
WHEN q1 < q2 THEN '+'
WHEN q1 = q2 THEN ' '
ELSE '-'
END AS judge_q1_q2
-- 計算Q1和Q2營收差
, q2 - q1 AS diff_q2_q1
-- 將Q1和Q2營收變化比較用1, 0, -1呈現
, SIGN(q2 - q1) AS sign_q2_q1
FROM quarterly_sales
ORDER BY year
;
n 找出年間最大/最小營收的季度:
SELECT
year
-- 取得Q1~Q4的最大營收
, greatest(q1, q2, q3, q4) AS greatest_sales
-- 取得Q1~Q4的最小營收
, least(q1, q2, q3, q4) AS least_sales
FROM quarterly_sales
ORDER BY year
;
SELECT
year
-- 取得Q1~Q4的最大營收
, greatest(q1, q2, q3, q4) AS greatest_sales
-- 取得Q1~Q4的最小營收
, least(q1, q2, q3, q4) AS least_sales
FROM quarterly_sales
ORDER BY year
;
n 單純計算年間平均季度營收的Query:
SELECT
year
, (q1 + q2 + q3 + q4) / 4 AS average
FROM quarterly_sales
ORDER BY year
;
SELECT
year
, (q1 + q2 + q3 + q4) / 4 AS average
FROM quarterly_sales
ORDER BY year
;
n 使用COALESCE將NULL置換成0後計算平均值:
SELECT
year
, (COALESCE(q1, 0) + COALESCE(q2, 0) + COALESCE(q3, 0) + COALESCE(q4, 0)) / 4 AS average
FROM quarterly_sales
ORDER BY year
;
SELECT
year
, (COALESCE(q1, 0) + COALESCE(q2, 0) + COALESCE(q3, 0) + COALESCE(q4, 0)) / 4 AS average
FROM quarterly_sales
ORDER BY year
;
n 只用數值不為NULL的欄位來計算平均值:
SELECT
year
, (COALESCE(q1, 0) + COALESCE(q2, 0) + COALESCE(q3, 0) + COALESCE(q4, 0)) / (SIGN(COALESCE(q1, 0)) + SIGN(COALESCE(q2, 0)) + SIGN(COALESCE(q3, 0)) + SIGN(COALESCE(q4, 0))) AS average
FROM quarterly_sales
ORDER BY year
;
SELECT
year
, (COALESCE(q1, 0) + COALESCE(q2, 0) + COALESCE(q3, 0) + COALESCE(q4, 0)) / (SIGN(COALESCE(q1, 0)) + SIGN(COALESCE(q2, 0)) + SIGN(COALESCE(q3, 0)) + SIGN(COALESCE(q4, 0))) AS average
FROM quarterly_sales
ORDER BY year
;
l 計算兩個數值的比率(除法、CAST函數、CASE語法、NULLIF函數):
n 範例資料表:
DROP TABLE IF EXISTS advertising_stats;
CREATE TABLE advertising_stats (
dt varchar(255)
, ad_id varchar(255)
, impressions integer
, clicks integer
);
INSERT INTO advertising_stats
VALUES
('2017-04-01', '001', 100000, 3000)
, ('2017-04-01', '002', 120000, 1200)
, ('2017-04-01', '003', 500000, 10000)
, ('2017-04-02', '001', 0, 0)
, ('2017-04-02', '002', 130000, 1400)
, ('2017-04-02', '003', 620000, 15000)
;
DROP TABLE IF EXISTS advertising_stats;
CREATE TABLE advertising_stats (
dt varchar(255)
, ad_id varchar(255)
, impressions integer
, clicks integer
);
INSERT INTO advertising_stats
VALUES
('2017-04-01', '001', 100000, 3000)
, ('2017-04-01', '002', 120000, 1200)
, ('2017-04-01', '003', 500000, 10000)
, ('2017-04-02', '001', 0, 0)
, ('2017-04-02', '002', 130000, 1400)
, ('2017-04-02', '003', 620000, 15000)
;
n 整數型別資料的除法運算:
SELECT
dt
, ad_id
-- 在PostgreSQL中, 整數型別的除法運算會捨去小數點以下的數值,所以要手動轉換資料型別
, CAST(clicks AS double precision) / impressions AS ctr
-- 在開頭用含小數的常數乘以clicks,資料型別就會自動進行轉換
, 100.0 * clicks / impressions AS ctr_as_percent
FROM advertising_stats
WHERE dt = '2017-04-01'
ORDER BY dt, ad_id
;
SELECT
dt
, ad_id
-- 在PostgreSQL中, 整數型別的除法運算會捨去小數點以下的數值,所以要手動轉換資料型別
, CAST(clicks AS double precision) / impressions AS ctr
-- 在開頭用含小數的常數乘以clicks,資料型別就會自動進行轉換
, 100.0 * clicks / impressions AS ctr_as_percent
FROM advertising_stats
WHERE dt = '2017-04-01'
ORDER BY dt, ad_id
;
n 避免除0並計算CTR:
SELECT
dt
, ad_id
-- 在CASE句設定分母不為0的條件,來避免除以0
, CASE
WHEN impressions > 0 THEN 100.0 * clicks / impressions
END AS ctr_as_percent_by_case
-- 當分母為0時就轉換成NULL,來避免除以0
, 100.0 * clicks / NULLIF(impressions, 0) AS ctr_as_percent_by_null
FROM advertising_stats
ORDER BY dt, ad_id
;
SELECT
dt
, ad_id
-- 在CASE句設定分母不為0的條件,來避免除以0
, CASE
WHEN impressions > 0 THEN 100.0 * clicks / impressions
END AS ctr_as_percent_by_case
-- 當分母為0時就轉換成NULL,來避免除以0
, 100.0 * clicks / NULLIF(impressions, 0) AS ctr_as_percent_by_null
FROM advertising_stats
ORDER BY dt, ad_id
;
l 計算兩個值的距離(absc函數、power函數、sqrt函數、point型別、<->運算子):
n 範例資料表:
DROP TABLE IF EXISTS location_1d;
CREATE TABLE location_1d (
x1 integer
, x2 integer
);
INSERT INTO location_1d
VALUES
( 5 , 10)
, (10 , 5)
, (-2 , 4)
, ( 3 , 3)
, ( 0 , 1)
;
DROP TABLE IF EXISTS location_2d;
CREATE TABLE location_2d (
x1 integer
, y1 integer
, x2 integer
, y2 integer
);
INSERT INTO location_2d
VALUES
(0, 0, 2, 2)
, (3, 5, 1, 2)
, (5, 3, 2, 1)
;
DROP TABLE IF EXISTS location_1d;
CREATE TABLE location_1d (
x1 integer
, x2 integer
);
INSERT INTO location_1d
VALUES
( 5 , 10)
, (10 , 5)
, (-2 , 4)
, ( 3 , 3)
, ( 0 , 1)
;
DROP TABLE IF EXISTS location_2d;
CREATE TABLE location_2d (
x1 integer
, y1 integer
, x2 integer
, y2 integer
);
INSERT INTO location_2d
VALUES
(0, 0, 2, 2)
, (3, 5, 1, 2)
, (5, 3, 2, 1)
;
n 計算一次元數據的絕對值和方均根:
SELECT
abs(x1 - x2) AS abs
, sqrt(power(x1 - x2, 2)) AS rms
FROM location_1d
;
SELECT
abs(x1 - x2) AS abs
, sqrt(power(x1 - x2, 2)) AS rms
FROM location_1d
;
n 對二次元數據計算均方根(歐幾里得距離):
SELECT
sqrt(power(x1 - x2, 2) + power(y1 - y2, 2)) AS dist
-- 在PostgreSQL可用point型別和距離運算子<->
-- point(x1, y1) <-> point(x2, y2) AS dist
FROM location_2d
;
SELECT
sqrt(power(x1 - x2, 2) + power(y1 - y2, 2)) AS dist
-- 在PostgreSQL可用point型別和距離運算子<->
-- point(x1, y1) <-> point(x2, y2) AS dist
FROM location_2d
;
l 計算日期、時間(interval型別、日期時間函數):
n 範例資料表:
DROP TABLE IF EXISTS mst_users_with_birthday;
CREATE TABLE mst_users_with_birthday (
user_id varchar(255)
, register_stamp varchar(255)
, birth_date varchar(255)
);
INSERT INTO mst_users_with_birthday
VALUES
('U001', '2016-02-28 10:00:00', '2000-02-29')
, ('U002', '2016-02-29 10:00:00', '2000-02-29')
, ('U003', '2016-03-01 10:00:00', '2000-02-29')
;
DROP TABLE IF EXISTS mst_users_with_birthday;
CREATE TABLE mst_users_with_birthday (
user_id varchar(255)
, register_stamp varchar(255)
, birth_date varchar(255)
);
INSERT INTO mst_users_with_birthday
VALUES
('U001', '2016-02-28 10:00:00', '2000-02-29')
, ('U002', '2016-02-29 10:00:00', '2000-02-29')
, ('U003', '2016-03-01 10:00:00', '2000-02-29')
;
n 計算未來或過去的日期時間:
SELECT
user_id
-- 可藉由interval型別的數據進行四則運算
, register_stamp::timestamp AS register_stamp
, register_stamp::timestamp + '1 hour'::interval AS after_1_hour
, register_stamp::timestamp - '30 minutes'::interval AS before_30_minutes
, register_stamp::date AS register_date
, (register_stamp::date + '1 day'::interval)::date AS after_1_day
, (register_stamp::date - '1 month'::interval)::date AS before_1_month
FROM mst_users_with_birthday
;
SELECT
user_id
-- 可藉由interval型別的數據進行四則運算
, register_stamp::timestamp AS register_stamp
, register_stamp::timestamp + '1 hour'::interval AS after_1_hour
, register_stamp::timestamp - '30 minutes'::interval AS before_30_minutes
, register_stamp::date AS register_date
, (register_stamp::date + '1 day'::interval)::date AS after_1_day
, (register_stamp::date - '1 month'::interval)::date AS before_1_month
FROM mst_users_with_birthday
;
n 計算兩個日期之間的天數差:
SELECT
user_id
-- 日期型別數據之間可進行減法
, CURRENT_DATE AS today
, register_stamp::date AS register_date
, CURRENT_DATE - register_stamp::date AS diff_days
FROM mst_users_with_birthday
;
SELECT
user_id
-- 日期型別數據之間可進行減法
, CURRENT_DATE AS today
, register_stamp::date AS register_date
, CURRENT_DATE - register_stamp::date AS diff_days
FROM mst_users_with_birthday
;
n 使用age函數計算年齡:
SELECT
user_id
-- 使用age函數和EXTRACT函數計算以年為單位的年齡
, CURRENT_DATE AS today
, register_stamp::date AS register_date
, birth_date::date AS birth_date
, EXTRACT(YEAR FROM age(birth_date::date)) AS current_age
, EXTRACT(YEAR FROM age(register_stamp::date, birth_date::date)) AS register_age
FROM mst_users_with_birthday
;
SELECT
user_id
-- 使用age函數和EXTRACT函數計算以年為單位的年齡
, CURRENT_DATE AS today
, register_stamp::date AS register_date
, birth_date::date AS birth_date
, EXTRACT(YEAR FROM age(birth_date::date)) AS current_age
, EXTRACT(YEAR FROM age(register_stamp::date, birth_date::date)) AS register_age
FROM mst_users_with_birthday
;
n 將日期以整數表示並計算年齡的Query:
-- 計算生日為2000年2月29日的人,在2016年2月28日時的年齡
SELECT floor((20160228 - 20000229) / 10000) AS age;
-- 計算生日為2000年2月29日的人,在2016年2月28日時的年齡
SELECT floor((20160228 - 20000229) / 10000) AS age;
n 從文字型別的生日計算註冊時間點和現在的年齡:
SELECT
user_id
, substring(register_stamp, 1, 10) AS register_date
, birth_date
-- 計算註冊時的年齡
, floor(
(CAST(replace(substring(register_stamp, 1, 10), '-', '') AS integer)
- CAST(replace(birth_date, '-', '') AS integer)
) / 10000
) AS register_age
-- 計算現在的年齡
, floor(
(CAST(replace(CAST(CURRENT_DATE AS text), '-', '') AS integer)
- CAST(replace(birth_date, '-', '') AS integer)
) / 10000
) AS current_age
FROM mst_users_with_birthday
;
SELECT
user_id
, substring(register_stamp, 1, 10) AS register_date
, birth_date
-- 計算註冊時的年齡
, floor(
(CAST(replace(substring(register_stamp, 1, 10), '-', '') AS integer)
- CAST(replace(birth_date, '-', '') AS integer)
) / 10000
) AS register_age
-- 計算現在的年齡
, floor(
(CAST(replace(CAST(CURRENT_DATE AS text), '-', '') AS integer)
- CAST(replace(birth_date, '-', '') AS integer)
) / 10000
) AS current_age
FROM mst_users_with_birthday
;
l 處理IP位置(inet型別、<<運算子、split_part函數、lpad函數):
n 比較inet型別的IP資料:
SELECT
CAST('127.0.0.1' AS inet) < CAST('127.0.0.2' AS inet) AS lt
, CAST('127.0.0.1' AS inet) > CAST('192.168.0.1' AS inet) AS gt
;
SELECT
CAST('127.0.0.1' AS inet) < CAST('127.0.0.2' AS inet) AS lt
, CAST('127.0.0.1' AS inet) > CAST('192.168.0.1' AS inet) AS gt
;
n 判別inet型別IP位置的範圍:
SELECT CAST('127.0.0.1' AS inet) << CAST('127.0.0.0/8' AS inet) AS is_contained;
SELECT CAST('127.0.0.1' AS inet) << CAST('127.0.0.0/8' AS inet) AS is_contained;
n 從IP位置擷取出四個octet的Query:
SELECT
ip
, CAST(split_part(ip, '.', 1) AS integer) AS ip_part_1
, CAST(split_part(ip, '.', 2) AS integer) AS ip_part_2
, CAST(split_part(ip, '.', 3) AS integer) AS ip_part_3
, CAST(split_part(ip, '.', 4) AS integer) AS ip_part_4
FROM
-- 在PostgreSQL必須明確地指定資料型別
(SELECT CAST('192.168.0.1' AS text) AS ip) AS t
;
SELECT
ip
, CAST(split_part(ip, '.', 1) AS integer) AS ip_part_1
, CAST(split_part(ip, '.', 2) AS integer) AS ip_part_2
, CAST(split_part(ip, '.', 3) AS integer) AS ip_part_3
, CAST(split_part(ip, '.', 4) AS integer) AS ip_part_4
FROM
-- 在PostgreSQL必須明確地指定資料型別
(SELECT CAST('192.168.0.1' AS text) AS ip) AS t
;
n 將IP位置以整數型別來呈現的Query:
SELECT
ip
, CAST(split_part(ip, '.', 1) AS integer) * 2^24
+ CAST(split_part(ip, '.', 2) AS integer) * 2^16
+ CAST(split_part(ip, '.', 3) AS integer) * 2^8
+ CAST(split_part(ip, '.', 4) AS integer) * 2^0
AS ip_integer
FROM
-- 在PostgreSQL必須明確地指定資料型別
(SELECT CAST('192.168.0.1' AS text) AS ip) AS t
;
SELECT
ip
, CAST(split_part(ip, '.', 1) AS integer) * 2^24
+ CAST(split_part(ip, '.', 2) AS integer) * 2^16
+ CAST(split_part(ip, '.', 3) AS integer) * 2^8
+ CAST(split_part(ip, '.', 4) AS integer) * 2^0
AS ip_integer
FROM
-- 在PostgreSQL必須明確地指定資料型別
(SELECT CAST('192.168.0.1' AS text) AS ip) AS t
;
n 將IP位置轉換成以0補位的字串:
SELECT
ip
, lpad(split_part(ip, '.', 1), 3, '0')
|| lpad(split_part(ip, '.', 2), 3, '0')
|| lpad(split_part(ip, '.', 3), 3, '0')
|| lpad(split_part(ip, '.', 4), 3, '0')
AS ip_padding
FROM
-- 在PostgreSQL必須明確地指定資料型別
(SELECT CAST('192.168.0.1' AS text) AS ip) AS t
;
SELECT
ip
, lpad(split_part(ip, '.', 1), 3, '0')
|| lpad(split_part(ip, '.', 2), 3, '0')
|| lpad(split_part(ip, '.', 3), 3, '0')
|| lpad(split_part(ip, '.', 4), 3, '0')
AS ip_padding
FROM
-- 在PostgreSQL必須明確地指定資料型別
(SELECT CAST('192.168.0.1' AS text) AS ip) AS t
;
對單一資料表進行操作
l 掌握整體特徵(彙總函數、GROUP BY語法、window函數、OVER(...PARTITION BY~)語法):
n 範例資料表:
DROP TABLE IF EXISTS review;
CREATE TABLE review (
user_id varchar(255)
, product_id varchar(255)
, score numeric
);
INSERT INTO review
VALUES
('U001', 'A001', 4.0)
, ('U001', 'A002', 5.0)
, ('U001', 'A003', 5.0)
, ('U002', 'A001', 3.0)
, ('U002', 'A002', 3.0)
, ('U002', 'A003', 4.0)
, ('U003', 'A001', 5.0)
, ('U003', 'A002', 4.0)
, ('U003', 'A003', 4.0)
;
DROP TABLE IF EXISTS review;
CREATE TABLE review (
user_id varchar(255)
, product_id varchar(255)
, score numeric
);
INSERT INTO review
VALUES
('U001', 'A001', 4.0)
, ('U001', 'A002', 5.0)
, ('U001', 'A003', 5.0)
, ('U002', 'A001', 3.0)
, ('U002', 'A002', 3.0)
, ('U002', 'A003', 4.0)
, ('U003', 'A001', 5.0)
, ('U003', 'A002', 4.0)
, ('U003', 'A003', 4.0)
;
n 使用彙總函數計算資料表整體特徵:
SELECT
COUNT(*) AS total_count
, COUNT(DISTINCT user_id) AS user_count
, COUNT(DISTINCT product_id) AS product_count
, SUM(score) AS sum
, AVG(score) AS avg
, MAX(score) AS max
, MIN(score) AS min
FROM review
;
SELECT
COUNT(*) AS total_count
, COUNT(DISTINCT user_id) AS user_count
, COUNT(DISTINCT product_id) AS product_count
, SUM(score) AS sum
, AVG(score) AS avg
, MAX(score) AS max
, MIN(score) AS min
FROM review
;
n 將資料表依使用者不同進行分組彙總:
SELECT
user_id
, COUNT(*) AS total_count
, COUNT(DISTINCT product_id) AS product_count
, SUM(score) AS sum
, AVG(score) AS avg
, MAX(score) AS max
, MIN(score) AS min
FROM review
GROUP BY user_id
;
SELECT
user_id
, COUNT(*) AS total_count
, COUNT(DISTINCT product_id) AS product_count
, SUM(score) AS sum
, AVG(score) AS avg
, MAX(score) AS max
, MIN(score) AS min
FROM review
GROUP BY user_id
;
n 使用視窗函數同時處理彙總函數的結果以及原始值:
SELECT
user_id
, product_id
-- 單一評價分數
, score
-- 全體平均評價分數
, AVG(score) OVER() AS avg_score
-- 使用者平均評價分數
, AVG(score) OVER(PARTITION BY user_id) AS user_avg_score
-- 單一評價分數和使用者平均評價分數的差
, score - AVG(score) OVER(PARTITION BY user_id) AS user_avg_score_diff
FROM review
;
SELECT
user_id
, product_id
-- 單一評價分數
, score
-- 全體平均評價分數
, AVG(score) OVER() AS avg_score
-- 使用者平均評價分數
, AVG(score) OVER(PARTITION BY user_id) AS user_avg_score
-- 單一評價分數和使用者平均評價分數的差
, score - AVG(score) OVER(PARTITION BY user_id) AS user_avg_score_diff
FROM review
;
l 群組中排序(視窗函數、OVER(...ORDER BY~)句、OVER(...ROWS~)句):
n 範例資料表:
DROP TABLE IF EXISTS popular_products;
CREATE TABLE popular_products (
product_id varchar(255)
, category varchar(255)
, score numeric
);
INSERT INTO popular_products
VALUES
('A001', 'action', 94)
, ('A002', 'action', 81)
, ('A003', 'action', 78)
, ('A004', 'action', 64)
, ('D001', 'drama' , 90)
, ('D002', 'drama' , 82)
, ('D003', 'drama' , 78)
, ('D004', 'drama' , 58)
;
DROP TABLE IF EXISTS popular_products;
CREATE TABLE popular_products (
product_id varchar(255)
, category varchar(255)
, score numeric
);
INSERT INTO popular_products
VALUES
('A001', 'action', 94)
, ('A002', 'action', 81)
, ('A003', 'action', 78)
, ('A004', 'action', 64)
, ('D001', 'drama' , 90)
, ('D002', 'drama' , 82)
, ('D003', 'drama' , 78)
, ('D004', 'drama' , 58)
;
n 利用視窗函數的ORDER BY句,設定資料表內的順序:
SELECT
product_id
, score
-- 依score的降冪順序給予唯一的編號
, ROW_NUMBER() OVER(ORDER BY score DESC) AS row
-- 允許同樣順位編號的排行方式,同順位之後會跳號
, RANK() OVER(ORDER BY score DESC) AS rank
-- 允許同順位,同順位之後不跳號的排行方式
, DENSE_RANK() OVER(ORDER BY score DESC) AS dense_rank
-- 取得目前資料行之前的資料行的值
, LAG(product_id) OVER(ORDER BY score DESC) AS lag1
, LAG(product_id, 2) OVER(ORDER BY score DESC) AS lag2
-- 取得目前資料行之後的資料行的值
, LEAD(product_id) OVER(ORDER BY score DESC) AS lead1
, LEAD(product_id, 2) OVER(ORDER BY score DESC) AS lead2
FROM popular_products
ORDER BY row
;
SELECT
product_id
, score
-- 依score的降冪順序給予唯一的編號
, ROW_NUMBER() OVER(ORDER BY score DESC) AS row
-- 允許同樣順位編號的排行方式,同順位之後會跳號
, RANK() OVER(ORDER BY score DESC) AS rank
-- 允許同順位,同順位之後不跳號的排行方式
, DENSE_RANK() OVER(ORDER BY score DESC) AS dense_rank
-- 取得目前資料行之前的資料行的值
, LAG(product_id) OVER(ORDER BY score DESC) AS lag1
, LAG(product_id, 2) OVER(ORDER BY score DESC) AS lag2
-- 取得目前資料行之後的資料行的值
, LEAD(product_id) OVER(ORDER BY score DESC) AS lead1
, LEAD(product_id, 2) OVER(ORDER BY score DESC) AS lead2
FROM popular_products
ORDER BY row
;
n 搭配使用ORDER BY句與彙總函數的Query:
SELECT
product_id
, score
-- 依score的降冪順序給予唯一的編號
, ROW_NUMBER() OVER(ORDER BY score DESC) AS row
-- 計算從排序前面的紀錄到目前資料行的累計加總分數(score)
, SUM(score) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_score
-- 計算目前資料行與前後各一筆資料行,共三筆資料行的平均分數
, AVG(score) OVER(ORDER BY score DESC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS local_avg
-- 取得排行最前面的商品ID
, FIRST_VALUE(product_id) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS first_value
-- 取得排行最後面的商品ID
, LAST_VALUE(product_id) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last_value
FROM popular_products
ORDER BY row
;
SELECT
product_id
, score
-- 依score的降冪順序給予唯一的編號
, ROW_NUMBER() OVER(ORDER BY score DESC) AS row
-- 計算從排序前面的紀錄到目前資料行的累計加總分數(score)
, SUM(score) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_score
-- 計算目前資料行與前後各一筆資料行,共三筆資料行的平均分數
, AVG(score) OVER(ORDER BY score DESC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS local_avg
-- 取得排行最前面的商品ID
, FIRST_VALUE(product_id) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS first_value
-- 取得排行最後面的商品ID
, LAST_VALUE(product_id) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last_value
FROM popular_products
ORDER BY row
;
n 在每個指定的視窗框架內彙總全部的商品ID:
SELECT
product_id
-- 依score的降冪順序給予唯一的編號
, ROW_NUMBER() OVER(ORDER BY score DESC) AS row
-- 以排序的最前端到最後端作為範圍,彙總商品ID
, array_agg(product_id) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS whole_agg
-- 以排序的最前端到目前的資料行作為範圍,彙總商品ID
, array_agg(product_id) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_agg
-- 將排序中的上一筆資料行到後一筆資料行作為處理範圍,彙總商品ID
, array_agg(product_id) OVER(ORDER BY score DESC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS local_agg
FROM popular_products
WHERE category = 'action'
ORDER BY row
;
SELECT
product_id
-- 依score的降冪順序給予唯一的編號
, ROW_NUMBER() OVER(ORDER BY score DESC) AS row
-- 以排序的最前端到最後端作為範圍,彙總商品ID
, array_agg(product_id) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS whole_agg
-- 以排序的最前端到目前的資料行作為範圍,彙總商品ID
, array_agg(product_id) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_agg
-- 將排序中的上一筆資料行到後一筆資料行作為處理範圍,彙總商品ID
, array_agg(product_id) OVER(ORDER BY score DESC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS local_agg
FROM popular_products
WHERE category = 'action'
ORDER BY row
;
n 用視窗函數分組計算各類別內的順位:
SELECT
category
, product_id
, score
-- 各類別內依分數進行排序,並給予唯一的順位編號
, ROW_NUMBER() OVER(PARTITION BY category ORDER BY score DESC) AS row
-- 各類別內以允許同順位的方式進行排序,並給予順位編號
, RANK() OVER(PARTITION BY category ORDER BY score DESC) AS rank
-- 各類別內允許同順位,以同順位之後不跳號的方式進行排序
, DENSE_RANK() OVER(PARTITION BY category ORDER BY score DESC) AS dense_rank
FROM popular_products
ORDER BY category, row
;
SELECT
category
, product_id
, score
-- 各類別內依分數進行排序,並給予唯一的順位編號
, ROW_NUMBER() OVER(PARTITION BY category ORDER BY score DESC) AS row
-- 各類別內以允許同順位的方式進行排序,並給予順位編號
, RANK() OVER(PARTITION BY category ORDER BY score DESC) AS rank
-- 各類別內允許同順位,以同順位之後不跳號的方式進行排序
, DENSE_RANK() OVER(PARTITION BY category ORDER BY score DESC) AS dense_rank
FROM popular_products
ORDER BY category, row
;
n 擷取各類別中排行前兩名的商品:
SELECT *
FROM
-- 在子查詢中計算排名
(SELECT
category
, product_id
, score
-- 各類別內分別依分數排序,給予唯一的順位編號
, ROW_NUMBER() OVER(PARTITION BY category ORDER BY score DESC) AS rank
FROM popular_products
) AS popular_products_with_rank
-- 外側的查詢設定排名條件
WHERE rank <= 2
ORDER BY category, rank
;
SELECT *
FROM
-- 在子查詢中計算排名
(SELECT
category
, product_id
, score
-- 各類別內分別依分數排序,給予唯一的順位編號
, ROW_NUMBER() OVER(PARTITION BY category ORDER BY score DESC) AS rank
FROM popular_products
) AS popular_products_with_rank
-- 外側的查詢設定排名條件
WHERE rank <= 2
ORDER BY category, rank
;
n 取得各類別分組分數第一名的商品:
-- 使用DISTINCT句排除重複的項目
SELECT DISTINCT
category
-- 取得各類別分組第一名的商品ID
, FIRST_VALUE(product_id) OVER(PARTITION BY category ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS product_id
FROM popular_products
;
-- 使用DISTINCT句排除重複的項目
SELECT DISTINCT
category
-- 取得各類別分組第一名的商品ID
, FIRST_VALUE(product_id) OVER(PARTITION BY category ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS product_id
FROM popular_products
;
l 將縱向數據轉換成橫向數據(MAX(CASE~)語法、string_agg函數、listagg函數、collect_list函數):
n 範例資料表:
DROP TABLE IF EXISTS daily_kpi;
CREATE TABLE daily_kpi (
dt varchar(255)
, indicator varchar(255)
, val integer
);
INSERT INTO daily_kpi
VALUES
('2017-01-01', 'impressions', 1800)
, ('2017-01-01', 'sessions' , 500)
, ('2017-01-01', 'users' , 200)
, ('2017-01-02', 'impressions', 2000)
, ('2017-01-02', 'sessions' , 700)
, ('2017-01-02', 'users' , 250)
;
DROP TABLE IF EXISTS purchase_detail_log;
CREATE TABLE purchase_detail_log (
purchase_id integer
, product_id varchar(255)
, price integer
);
INSERT INTO purchase_detail_log
VALUES
(100001, 'A001', 300)
, (100001, 'A002', 400)
, (100001, 'A003', 200)
, (100002, 'D001', 500)
, (100002, 'D002', 300)
, (100003, 'A001', 300)
;
DROP TABLE IF EXISTS daily_kpi;
CREATE TABLE daily_kpi (
dt varchar(255)
, indicator varchar(255)
, val integer
);
INSERT INTO daily_kpi
VALUES
('2017-01-01', 'impressions', 1800)
, ('2017-01-01', 'sessions' , 500)
, ('2017-01-01', 'users' , 200)
, ('2017-01-02', 'impressions', 2000)
, ('2017-01-02', 'sessions' , 700)
, ('2017-01-02', 'users' , 250)
;
DROP TABLE IF EXISTS purchase_detail_log;
CREATE TABLE purchase_detail_log (
purchase_id integer
, product_id varchar(255)
, price integer
);
INSERT INTO purchase_detail_log
VALUES
(100001, 'A001', 300)
, (100001, 'A002', 400)
, (100001, 'A003', 200)
, (100002, 'D001', 500)
, (100002, 'D002', 300)
, (100003, 'A001', 300)
;
n 將以資料行形式儲存的指標數值轉換成以列的形式儲存:
SELECT
dt
, MAX(CASE WHEN indicator = 'impressions' THEN val END) AS impressions
, MAX(CASE WHEN indicator = 'sessions' THEN val END) AS sessions
, MAX(CASE WHEN indicator = 'users' THEN val END) AS users
FROM daily_kpi
GROUP BY dt
ORDER BY dt
;
SELECT
dt
, MAX(CASE WHEN indicator = 'impressions' THEN val END) AS impressions
, MAX(CASE WHEN indicator = 'sessions' THEN val END) AS sessions
, MAX(CASE WHEN indicator = 'users' THEN val END) AS users
FROM daily_kpi
GROUP BY dt
ORDER BY dt
;
n 將資料行彙總成含有區隔符號的字串:
SELECT
purchase_id
-- 將商品ID集中儲存於陣列,再轉換成以逗號為區隔符號的字串
, string_agg(product_id, ',') AS product_ids
, SUM(price) AS amount
FROM purchase_detail_log
GROUP BY purchase_id
ORDER BY purchase_id
;
SELECT
purchase_id
-- 將商品ID集中儲存於陣列,再轉換成以逗號為區隔符號的字串
, string_agg(product_id, ',') AS product_ids
, SUM(price) AS amount
FROM purchase_detail_log
GROUP BY purchase_id
ORDER BY purchase_id
;
l 將橫向數據轉換成縱向數據(unnest函數、explode函數、CROSS JOIN、LATERAL VIEW、regexp_split_to_table函數):
n 範例資料表:
DROP TABLE IF EXISTS quarterly_sales;
CREATE TABLE quarterly_sales (
year integer
, q1 integer
, q2 integer
, q3 integer
, q4 integer
);
INSERT INTO quarterly_sales
VALUES
(2015, 82000, 83000, 78000, 83000)
, (2016, 85000, 85000, 80000, 81000)
, (2017, 92000, 81000, NULL , NULL )
;
DROP TABLE IF EXISTS quarterly_sales;
CREATE TABLE quarterly_sales (
year integer
, q1 integer
, q2 integer
, q3 integer
, q4 integer
);
INSERT INTO quarterly_sales
VALUES
(2015, 82000, 83000, 78000, 83000)
, (2016, 85000, 85000, 80000, 81000)
, (2017, 92000, 81000, NULL , NULL )
;
n 使用連號資料表進行數據的行列轉換:
SELECT
q.year
-- 顯示從Q1到Q4的標籤名稱
, CASE
WHEN p.idx = 1 THEN 'q1'
WHEN p.idx = 2 THEN 'q2'
WHEN p.idx = 3 THEN 'q3'
WHEN p.idx = 4 THEN 'q4'
END AS quarter
-- 顯示從Q1到Q4的營收金額
, CASE
WHEN p.idx = 1 THEN q.q1
WHEN p.idx = 2 THEN q.q2
WHEN p.idx = 3 THEN q.q3
WHEN p.idx = 4 THEN q.q4
END AS sales
FROM
quarterly_sales AS q
CROSS JOIN
-- 建立和欲展開之數據數目相同的連號資料表
(SELECT 1 AS idx
UNION ALL SELECT 2 AS idx
UNION ALL SELECT 3 AS idx
UNION ALL SELECT 4 AS idx
) AS p
;
SELECT
q.year
-- 顯示從Q1到Q4的標籤名稱
, CASE
WHEN p.idx = 1 THEN 'q1'
WHEN p.idx = 2 THEN 'q2'
WHEN p.idx = 3 THEN 'q3'
WHEN p.idx = 4 THEN 'q4'
END AS quarter
-- 顯示從Q1到Q4的營收金額
, CASE
WHEN p.idx = 1 THEN q.q1
WHEN p.idx = 2 THEN q.q2
WHEN p.idx = 3 THEN q.q3
WHEN p.idx = 4 THEN q.q4
END AS sales
FROM
quarterly_sales AS q
CROSS JOIN
-- 建立和欲展開之數據數目相同的連號資料表
(SELECT 1 AS idx
UNION ALL SELECT 2 AS idx
UNION ALL SELECT 3 AS idx
UNION ALL SELECT 4 AS idx
) AS p
;
n 利用資料表函數將陣列展開至record:
SELECT unnest(ARRAY['A001', 'A002', 'A003']) AS product_id;
SELECT unnest(ARRAY['A001', 'A002', 'A003']) AS product_id;
n 範例資料表:
DROP TABLE IF EXISTS purchase_log;
CREATE TABLE purchase_log (
purchase_id integer
, product_ids varchar(255)
);
INSERT INTO purchase_log
VALUES
(100001, 'A001,A002,A003')
, (100002, 'D001,D002')
, (100003, 'A001')
;
DROP TABLE IF EXISTS purchase_log;
CREATE TABLE purchase_log (
purchase_id integer
, product_ids varchar(255)
);
INSERT INTO purchase_log
VALUES
(100001, 'A001,A002,A003')
, (100002, 'D001,D002')
, (100003, 'A001')
;
n 使用資料表函數展開含有逗點區隔符號的數據:
SELECT
purchase_id
, product_id
FROM
purchase_log AS p
-- string_to_array利用函數將字串轉換成陣列,再用unnest函數轉換成資料表
CROSS JOIN unnest(string_to_array(product_ids, ',')) AS product_id
;
SELECT
purchase_id
, product_id
FROM
purchase_log AS p
-- string_to_array利用函數將字串轉換成陣列,再用unnest函數轉換成資料表
CROSS JOIN unnest(string_to_array(product_ids, ',')) AS product_id
;
n 利用PostgreSQL展開含有逗點區隔符號的數據:
SELECT
purchase_id
-- 同時切割含有逗點區隔符號的字串並展開至資料行(record)
, regexp_split_to_table(product_ids, ',') AS product_id
FROM purchase_log
;
SELECT
purchase_id
-- 同時切割含有逗點區隔符號的字串並展開至資料行(record)
, regexp_split_to_table(product_ids, ',') AS product_id
FROM purchase_log
;
n 建立連號資料表:
SELECT *
FROM (
SELECT 1 AS idx
UNION ALL SELECT 2 AS idx
UNION ALL SELECT 3 AS idx
) AS pivot
;
SELECT *
FROM (
SELECT 1 AS idx
UNION ALL SELECT 2 AS idx
UNION ALL SELECT 3 AS idx
) AS pivot
;
n split_part的使用範例:
SELECT
split_part('A001,A002,A003', ',', 1) AS part_1
, split_part('A001,A002,A003', ',', 2) AS part_2
, split_part('A001,A002,A003', ',', 3) AS part_3
;
SELECT
split_part('A001,A002,A003', ',', 1) AS part_1
, split_part('A001,A002,A003', ',', 2) AS part_2
, split_part('A001,A002,A003', ',', 3) AS part_3
;
n 用字串文字數的差值計算商品數:
SELECT
purchase_id
, product_ids
-- 從商品ID的字串中去除逗號,並利用文字數差值,計算商品數目
, 1 + char_length(product_ids) - char_length(replace(product_ids, ',', '')) AS product_num
FROM purchase_log
;
SELECT
purchase_id
, product_ids
-- 從商品ID的字串中去除逗號,並利用文字數差值,計算商品數目
, 1 + char_length(product_ids) - char_length(replace(product_ids, ',', '')) AS product_num
FROM purchase_log
;
n 使用連號資料表將字串展開至資料行:
SELECT
l.purchase_id
, l.product_ids
-- 依商品數目給予聯結的編號
, p.idx
-- 藉由逗號區隔符號分割字串,取出第idx的元素
, split_part(l.product_ids, ',', p.idx) AS product_id
FROM
purchase_log AS l
JOIN
(SELECT 1 AS idx
UNION ALL SELECT 2 AS idx
UNION ALL SELECT 3 AS idx
) AS p
-- 只和連號資料表id在商品數目以下的部分進行聯結
ON p.idx <= (1 + char_length(l.product_ids) - char_length(replace(l.product_ids, ',', '')))
;
SELECT
l.purchase_id
, l.product_ids
-- 依商品數目給予聯結的編號
, p.idx
-- 藉由逗號區隔符號分割字串,取出第idx的元素
, split_part(l.product_ids, ',', p.idx) AS product_id
FROM
purchase_log AS l
JOIN
(SELECT 1 AS idx
UNION ALL SELECT 2 AS idx
UNION ALL SELECT 3 AS idx
) AS p
-- 只和連號資料表id在商品數目以下的部分進行聯結
ON p.idx <= (1 + char_length(l.product_ids) - char_length(replace(l.product_ids, ',', '')))
;
對複數資料表進行操作
l 縱向聯結複數資料表(UNION ALL語法):
n 範例資料表:
DROP TABLE IF EXISTS app1_mst_users;
CREATE TABLE app1_mst_users (
user_id varchar(255)
, name varchar(255)
, email varchar(255)
);
INSERT INTO app1_mst_users
VALUES
('U001', 'Sato' , 'sato@example.com' )
, ('U002', 'Suzuki', 'suzuki@example.com')
;
DROP TABLE IF EXISTS app2_mst_users;
CREATE TABLE app2_mst_users (
user_id varchar(255)
, name varchar(255)
, phone varchar(255)
);
INSERT INTO app2_mst_users
VALUES
('U001', 'Ito' , '080-xxxx-xxxx')
, ('U002', 'Tanaka', '070-xxxx-xxxx')
;
DROP TABLE IF EXISTS app1_mst_users;
CREATE TABLE app1_mst_users (
user_id varchar(255)
, name varchar(255)
, email varchar(255)
);
INSERT INTO app1_mst_users
VALUES
('U001', 'Sato' , 'sato@example.com' )
, ('U002', 'Suzuki', 'suzuki@example.com')
;
DROP TABLE IF EXISTS app2_mst_users;
CREATE TABLE app2_mst_users (
user_id varchar(255)
, name varchar(255)
, phone varchar(255)
);
INSERT INTO app2_mst_users
VALUES
('U001', 'Ito' , '080-xxxx-xxxx')
, ('U002', 'Tanaka', '070-xxxx-xxxx')
;
n 使用UNION ALL句縱向聯結資料表:
SELECT
'app1' AS app_name
, user_id
, name
, email
FROM app1_mst_users
UNION ALL
SELECT
'app2' AS app_name
, user_id
, name
, NULL AS email
FROM app2_mst_users
;
SELECT
'app1' AS app_name
, user_id
, name
FROM app1_mst_users
UNION ALL
SELECT
'app2' AS app_name
, user_id
, name
, NULL AS email
FROM app2_mst_users
;
l 橫向聯結複數資料表(LEFT JOIN、關聯子查詢):
n 範例資料表:
DROP TABLE IF EXISTS mst_categories;
CREATE TABLE mst_categories (
category_id integer
, name varchar(255)
);
INSERT INTO mst_categories
VALUES
(1, 'dvd' )
, (2, 'cd' )
, (3, 'book')
;
DROP TABLE IF EXISTS category_sales;
CREATE TABLE category_sales (
category_id integer
, sales integer
);
INSERT INTO category_sales
VALUES
(1, 850000)
, (2, 500000)
;
DROP TABLE IF EXISTS product_sale_ranking;
CREATE TABLE product_sale_ranking (
category_id integer
, rank integer
, product_id varchar(255)
, sales integer
);
INSERT INTO product_sale_ranking
VALUES
(1, 1, 'D001', 50000)
, (1, 2, 'D002', 20000)
, (1, 3, 'D003', 10000)
, (2, 1, 'C001', 30000)
, (2, 2, 'C002', 20000)
, (2, 3, 'C003', 10000)
;
DROP TABLE IF EXISTS mst_categories;
CREATE TABLE mst_categories (
category_id integer
, name varchar(255)
);
INSERT INTO mst_categories
VALUES
(1, 'dvd' )
, (2, 'cd' )
, (3, 'book')
;
DROP TABLE IF EXISTS category_sales;
CREATE TABLE category_sales (
category_id integer
, sales integer
);
INSERT INTO category_sales
VALUES
(1, 850000)
, (2, 500000)
;
DROP TABLE IF EXISTS product_sale_ranking;
CREATE TABLE product_sale_ranking (
category_id integer
, rank integer
, product_id varchar(255)
, sales integer
);
INSERT INTO product_sale_ranking
VALUES
(1, 1, 'D001', 50000)
, (1, 2, 'D002', 20000)
, (1, 3, 'D003', 10000)
, (2, 1, 'C001', 30000)
, (2, 2, 'C002', 20000)
, (2, 3, 'C003', 10000)
;
n 橫向聯結複數資料表:
SELECT
m.category_id
, m.name
, s.sales
, r.product_id AS sale_product
FROM
mst_categories AS m
JOIN
-- 聯結各類別銷售金額
category_sales AS s
ON m.category_id = s.category_id
JOIN
-- 聯結各類別商品
product_sale_ranking AS r
ON m.category_id = r.category_id
;
SELECT
m.category_id
, m.name
, s.sales
, r.product_id AS sale_product
FROM
mst_categories AS m
JOIN
-- 聯結各類別銷售金額
category_sales AS s
ON m.category_id = s.category_id
JOIN
-- 聯結各類別商品
product_sale_ranking AS r
ON m.category_id = r.category_id
;
n 不變動資料表行數並進行複數資料表的橫向聯結:
SELECT
m.category_id
, m.name
, s.sales
, r.product_id AS top_sale_product
FROM
mst_categories AS m
-- 使用左外部聯結保留無法建立聯結的record
LEFT JOIN
-- 聯結各類別銷售金額
category_sales AS s
ON m.category_id = s.category_id
-- 使用左外部聯結保留無法建立聯結的record
LEFT JOIN
-- 聯結各類別銷售第一名的商品
product_sale_ranking AS r
ON m.category_id = r.category_id
AND r.rank = 1
;
SELECT
m.category_id
, m.name
, s.sales
, r.product_id AS top_sale_product
FROM
mst_categories AS m
-- 使用左外部聯結保留無法建立聯結的record
LEFT JOIN
-- 聯結各類別銷售金額
category_sales AS s
ON m.category_id = s.category_id
-- 使用左外部聯結保留無法建立聯結的record
LEFT JOIN
-- 聯結各類別銷售第一名的商品
product_sale_ranking AS r
ON m.category_id = r.category_id
AND r.rank = 1
;
n 使用關聯子查詢橫向聯結複數資料表:
SELECT
m.category_id
, m.name
-- 用關聯子查詢取得各類別的銷售金額
, (SELECT s.sales
FROM category_sales AS s
WHERE m.category_id = s.category_id
) AS sales
-- 用關聯子查詢取得銷售第一的商品(不需使用rank欄位設定條件)
, (SELECT r.product_id
FROM product_sale_ranking AS r
WHERE m.category_id = r.category_id
ORDER BY sales DESC
LIMIT 1
) AS top_sale_product
FROM mst_categories AS m
;
SELECT
m.category_id
, m.name
-- 用關聯子查詢取得各類別的銷售金額
, (SELECT s.sales
FROM category_sales AS s
WHERE m.category_id = s.category_id
) AS sales
-- 用關聯子查詢取得銷售第一的商品(不需使用rank欄位設定條件)
, (SELECT r.product_id
FROM product_sale_ranking AS r
WHERE m.category_id = r.category_id
ORDER BY sales DESC
LIMIT 1
) AS top_sale_product
FROM mst_categories AS m
;
l 用0和1表示條件旗標(CASE句、SIGN函數):
n 範例資料表:
DROP TABLE IF EXISTS mst_users_with_card_number;
CREATE TABLE mst_users_with_card_number (
user_id varchar(255)
, card_number varchar(255)
);
INSERT INTO mst_users_with_card_number
VALUES
('U001', '1234-xxxx-xxxx-xxxx')
, ('U002', NULL )
, ('U003', '5678-xxxx-xxxx-xxxx')
;
DROP TABLE IF EXISTS purchase_log;
CREATE TABLE purchase_log (
purchase_id integer
, user_id varchar(255)
, amount integer
, stamp varchar(255)
);
INSERT INTO purchase_log
VALUES
(10001, 'U001', 200, '2017-01-30 10:00:00')
, (10002, 'U001', 500, '2017-02-10 10:00:00')
, (10003, 'U001', 200, '2017-02-12 10:00:00')
, (10004, 'U002', 800, '2017-03-01 10:00:00')
, (10005, 'U002', 400, '2017-03-02 10:00:00')
;
DROP TABLE IF EXISTS mst_users_with_card_number;
CREATE TABLE mst_users_with_card_number (
user_id varchar(255)
, card_number varchar(255)
);
INSERT INTO mst_users_with_card_number
VALUES
('U001', '1234-xxxx-xxxx-xxxx')
, ('U002', NULL )
, ('U003', '5678-xxxx-xxxx-xxxx')
;
DROP TABLE IF EXISTS purchase_log;
CREATE TABLE purchase_log (
purchase_id integer
, user_id varchar(255)
, amount integer
, stamp varchar(255)
);
INSERT INTO purchase_log
VALUES
(10001, 'U001', 200, '2017-01-30 10:00:00')
, (10002, 'U001', 500, '2017-02-10 10:00:00')
, (10003, 'U001', 200, '2017-02-12 10:00:00')
, (10004, 'U002', 800, '2017-03-01 10:00:00')
, (10005, 'U002', 400, '2017-03-02 10:00:00')
;
n 用0與1的條件旗標來表示信用卡登錄與購買紀錄的狀態:
SELECT
m.user_id
, m.card_number
, COUNT(p.user_id) AS purchase_count
-- 有登記信用卡號碼時用1,沒有登記時用0來表示
, CASE WHEN m.card_number IS NOT NULL THEN 1 ELSE 0 END AS has_card
-- 有購買紀錄時為1,沒有時為0
, SIGN(COUNT(p.user_id)) AS has_purchased
FROM
mst_users_with_card_number AS m
LEFT JOIN
purchase_log AS p
ON m.user_id = p.user_id
GROUP BY
m.user_id, m.card_number
;
SELECT
m.user_id
, m.card_number
, COUNT(p.user_id) AS purchase_count
-- 有登記信用卡號碼時用1,沒有登記時用0來表示
, CASE WHEN m.card_number IS NOT NULL THEN 1 ELSE 0 END AS has_card
-- 有購買紀錄時為1,沒有時為0
, SIGN(COUNT(p.user_id)) AS has_purchased
FROM
mst_users_with_card_number AS m
LEFT JOIN
purchase_log AS p
ON m.user_id = p.user_id
GROUP BY
m.user_id, m.card_number
;
l 將資料表命名後重複利用(Common Table Expression, CTE(WITH句)):
n 範例資料表:
DROP TABLE IF EXISTS product_sales;
CREATE TABLE product_sales (
category_name varchar(255)
, product_id varchar(255)
, sales integer
);
INSERT INTO product_sales
VALUES
('dvd' , 'D001', 50000)
, ('dvd' , 'D002', 20000)
, ('dvd' , 'D003', 10000)
, ('cd' , 'C001', 30000)
, ('cd' , 'C002', 20000)
, ('cd' , 'C003', 10000)
, ('book', 'B001', 20000)
, ('book', 'B002', 15000)
, ('book', 'B003', 10000)
, ('book', 'B004', 5000)
;
DROP TABLE IF EXISTS product_sales;
CREATE TABLE product_sales (
category_name varchar(255)
, product_id varchar(255)
, sales integer
);
INSERT INTO product_sales
VALUES
('dvd' , 'D001', 50000)
, ('dvd' , 'D002', 20000)
, ('dvd' , 'D003', 10000)
, ('cd' , 'C001', 30000)
, ('cd' , 'C002', 20000)
, ('cd' , 'C003', 10000)
, ('book', 'B001', 20000)
, ('book', 'B002', 15000)
, ('book', 'B003', 10000)
, ('book', 'B004', 5000)
;
n 綜合顯示各類別的排行榜:
WITH
product_sale_ranking AS (
SELECT
category_name
, product_id
, sales
, ROW_NUMBER() OVER(PARTITION BY category_name ORDER BY sales DESC) AS rank
FROM product_sales
)
, mst_rank AS (
SELECT DISTINCT rank
FROM product_sale_ranking
)
SELECT
m.rank
, r1.product_id AS dvd
, r1.sales AS dvd_sales
, r2.product_id AS cd
, r2.sales AS cd_sales
, r3.product_id AS book
, r3.sales AS book_sales
FROM
mst_rank AS m
LEFT JOIN
product_sale_ranking AS r1
ON m.rank = r1.rank
AND r1.category_name = 'dvd'
LEFT JOIN
product_sale_ranking AS r2
ON m.rank = r2.rank
AND r2.category_name = 'cd'
LEFT JOIN
product_sale_ranking AS r3
ON m.rank = r3.rank
AND r3.category_name = 'book'
ORDER BY m.rank
;
WITH
product_sale_ranking AS (
SELECT
category_name
, product_id
, sales
, ROW_NUMBER() OVER(PARTITION BY category_name ORDER BY sales DESC) AS rank
FROM product_sales
)
, mst_rank AS (
SELECT DISTINCT rank
FROM product_sale_ranking
)
SELECT
m.rank
, r1.product_id AS dvd
, r1.sales AS dvd_sales
, r2.product_id AS cd
, r2.sales AS cd_sales
, r3.product_id AS book
, r3.sales AS book_sales
FROM
mst_rank AS m
LEFT JOIN
product_sale_ranking AS r1
ON m.rank = r1.rank
AND r1.category_name = 'dvd'
LEFT JOIN
product_sale_ranking AS r2
ON m.rank = r2.rank
AND r2.category_name = 'cd'
LEFT JOIN
product_sale_ranking AS r3
ON m.rank = r3.rank
AND r3.category_name = 'book'
ORDER BY m.rank
;
l 建立虛擬資料表(UNION ALL句、VALUE句、explode函數、generate_series函數):
n 範例資料表:
DROP TABLE IF EXISTS users;
CREATE TABLE users(
user_id varchar(255)
, regist_device integer
);
INSERT INTO users
VALUES
('U001', 1)
, ('U002', 2)
, ('U003', 3)
;
DROP TABLE IF EXISTS users;
CREATE TABLE users(
user_id varchar(255)
, regist_device integer
);
INSERT INTO users
VALUES
('U001', 1)
, ('U002', 2)
, ('U003', 3)
;
n 使用虛擬資料表將編碼值轉換成標籤:
WITH
mst_devices AS (
SELECT 1 AS device_id, 'PC' AS device_name
UNION ALL SELECT 2 AS device_id, 'SP' AS device_name
UNION ALL SELECT 3 AS device_id, 'APP' AS device_name
)
SELECT
u.user_id
, d.device_name
FROM
users AS u
LEFT JOIN
mst_devices AS d
ON u.regist_device = d.device_id
;
WITH
mst_devices AS (
SELECT 1 AS device_id, 'PC' AS device_name
UNION ALL SELECT 2 AS device_id, 'SP' AS device_name
UNION ALL SELECT 3 AS device_id, 'APP' AS device_name
)
SELECT
u.user_id
, d.device_name
FROM
users AS u
LEFT JOIN
mst_devices AS d
ON u.regist_device = d.device_id
;
n 用VALUES句動態建立資料表:
WITH
mst_devices(device_id, device_name) AS (
VALUES
(1, 'PC')
, (2, 'SP')
, (3, 'APP')
)
SELECT *
FROM mst_devices
;
WITH
mst_devices(device_id, device_name) AS (
VALUES
(1, 'PC')
, (2, 'SP')
, (3, 'APP')
)
SELECT *
FROM mst_devices
;
n 建立有連號的虛擬資料表:
WITH
series AS (
-- 建立1到5的連續編號
SELECT generate_series(1, 5) AS idx
)
SELECT *
FROM series
;
WITH
series AS (
-- 建立1到5的連續編號
SELECT generate_series(1, 5) AS idx
)
SELECT *
FROM series
;
二、營收狀態相關數據的彙總、分析
沿著時間軸蒐集數據
l 彙總每日營收(GROUP BY句、SUM函數、AVG函數)(分析:加總、平均):
n 範例資歷表:
DROP TABLE IF EXISTS purchase_log;
CREATE TABLE purchase_log(
dt varchar(255)
, order_id integer
, user_id varchar(255)
, purchase_amount integer
);
INSERT INTO purchase_log
VALUES
('2014-01-01', 1, 'rhwpvvitou', 13900)
, ('2014-01-01', 2, 'hqnwoamzic', 10616)
, ('2014-01-02', 3, 'tzlmqryunr', 21156)
, ('2014-01-02', 4, 'wkmqqwbyai', 14893)
, ('2014-01-03', 5, 'ciecbedwbq', 13054)
, ('2014-01-03', 6, 'svgnbqsagx', 24384)
, ('2014-01-03', 7, 'dfgqftdocu', 15591)
, ('2014-01-04', 8, 'sbgqlzkvyn', 3025)
, ('2014-01-04', 9, 'lbedmngbol', 24215)
, ('2014-01-04', 10, 'itlvssbsgx', 2059)
, ('2014-01-05', 11, 'jqcmmguhik', 4235)
, ('2014-01-05', 12, 'jgotcrfeyn', 28013)
, ('2014-01-05', 13, 'pgeojzoshx', 16008)
, ('2014-01-06', 14, 'msjberhxnx', 1980)
, ('2014-01-06', 15, 'tlhbolohte', 23494)
, ('2014-01-06', 16, 'gbchhkcotf', 3966)
, ('2014-01-07', 17, 'zfmbpvpzvu', 28159)
, ('2014-01-07', 18, 'yauwzpaxtx', 8715)
, ('2014-01-07', 19, 'uyqboqfgex', 10805)
, ('2014-01-08', 20, 'hiqdkrzcpq', 3462)
, ('2014-01-08', 21, 'zosbvlylpv', 13999)
, ('2014-01-08', 22, 'bwfbchzgnl', 2299)
, ('2014-01-09', 23, 'zzgauelgrt', 16475)
, ('2014-01-09', 24, 'qrzfcwecge', 6469)
, ('2014-01-10', 25, 'njbpsrvvcq', 16584)
, ('2014-01-10', 26, 'cyxfgumkst', 11339)
;
DROP TABLE IF EXISTS purchase_log;
CREATE TABLE purchase_log(
dt varchar(255)
, order_id integer
, user_id varchar(255)
, purchase_amount integer
);
INSERT INTO purchase_log
VALUES
('2014-01-01', 1, 'rhwpvvitou', 13900)
, ('2014-01-01', 2, 'hqnwoamzic', 10616)
, ('2014-01-02', 3, 'tzlmqryunr', 21156)
, ('2014-01-02', 4, 'wkmqqwbyai', 14893)
, ('2014-01-03', 5, 'ciecbedwbq', 13054)
, ('2014-01-03', 6, 'svgnbqsagx', 24384)
, ('2014-01-03', 7, 'dfgqftdocu', 15591)
, ('2014-01-04', 8, 'sbgqlzkvyn', 3025)
, ('2014-01-04', 9, 'lbedmngbol', 24215)
, ('2014-01-04', 10, 'itlvssbsgx', 2059)
, ('2014-01-05', 11, 'jqcmmguhik', 4235)
, ('2014-01-05', 12, 'jgotcrfeyn', 28013)
, ('2014-01-05', 13, 'pgeojzoshx', 16008)
, ('2014-01-06', 14, 'msjberhxnx', 1980)
, ('2014-01-06', 15, 'tlhbolohte', 23494)
, ('2014-01-06', 16, 'gbchhkcotf', 3966)
, ('2014-01-07', 17, 'zfmbpvpzvu', 28159)
, ('2014-01-07', 18, 'yauwzpaxtx', 8715)
, ('2014-01-07', 19, 'uyqboqfgex', 10805)
, ('2014-01-08', 20, 'hiqdkrzcpq', 3462)
, ('2014-01-08', 21, 'zosbvlylpv', 13999)
, ('2014-01-08', 22, 'bwfbchzgnl', 2299)
, ('2014-01-09', 23, 'zzgauelgrt', 16475)
, ('2014-01-09', 24, 'qrzfcwecge', 6469)
, ('2014-01-10', 25, 'njbpsrvvcq', 16584)
, ('2014-01-10', 26, 'cyxfgumkst', 11339)
;
n 彙總每日營收和平均購買金額的Query程式:
SELECT
dt
, COUNT(*) AS purchase_count
, SUM(purchase_amount) AS total_amount
, AVG(purchase_amount) AS avg_amount
FROM purchase_log
GROUP BY dt
ORDER BY dt
;
SELECT
dt
, COUNT(*) AS purchase_count
, SUM(purchase_amount) AS total_amount
, AVG(purchase_amount) AS avg_amount
FROM purchase_log
GROUP BY dt
ORDER BY dt
;
l 使用移動平均觀察每日的變動(OVER(ORDER BY~))(分析:移動平均):
n 範例資歷表:
同上例
同上例
n 彙總每日營收與七日移動平均的Query:
SELECT
dt
, SUM(purchase_amount) AS total_amount
-- 計算過去最多七日間的平均營收金額
, AVG(SUM(purchase_amount)) OVER(ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS seven_day_avg
-- 嚴謹計算過去七日間的平均營收金額
, CASE
WHEN
7 = COUNT(*)
OVER(ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW)
THEN
AVG(SUM(purchase_amount))
OVER(ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW)
END AS seven_day_avg_strict
FROM purchase_log
GROUP BY dt
ORDER BY dt
;
SELECT
dt
, SUM(purchase_amount) AS total_amount
-- 計算過去最多七日間的平均營收金額
, AVG(SUM(purchase_amount)) OVER(ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS seven_day_avg
-- 嚴謹計算過去七日間的平均營收金額
, CASE
WHEN
7 = COUNT(*)
OVER(ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW)
THEN
AVG(SUM(purchase_amount))
OVER(ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW)
END AS seven_day_avg_strict
FROM purchase_log
GROUP BY dt
ORDER BY dt
;
l 計算當月累計營收(OVER(PARTITION BY~ ORDER BY~))(分析:累計):
n 範例資歷表:
同上例
同上例
n 彙總每日營收與當月累計營收的Query:
SELECT
dt
-- 使用substring或substr擷取「年-月」的部份
, substring(dt, 1, 7) AS year_month
, SUM(purchase_amount) AS total_amount
, SUM(SUM(purchase_amount)) OVER(PARTITION BY substring(dt, 1, 7) ORDER BY dt ROWS UNBOUNDED PRECEDING) AS agg_amount
FROM purchase_log
GROUP BY dt
ORDER BY dt
;
SELECT
dt
-- 使用substring或substr擷取「年-月」的部份
, substring(dt, 1, 7) AS year_month
, SUM(purchase_amount) AS total_amount
, SUM(SUM(purchase_amount)) OVER(PARTITION BY substring(dt, 1, 7) ORDER BY dt ROWS UNBOUNDED PRECEDING) AS agg_amount
FROM purchase_log
GROUP BY dt
ORDER BY dt
;
n 在daily_purchase資料表計算當月累計營收的Query:
WITH
daily_purchase AS (
SELECT
dt
-- 分別擷取「年」、「月」、「日」
, substring(dt, 1, 4) AS year
, substring(dt, 6, 2) AS month
, substring(dt, 9, 2) AS date
, SUM(purchase_amount) AS purchase_amount
FROM purchase_log
GROUP BY dt
)
SELECT
dt
, concat(year, '-', month) AS year_month
, purchase_amount
, SUM(purchase_amount) OVER(PARTITION BY year, month ORDER BY dt ROWS UNBOUNDED PRECEDING) AS agg_amount
FROM daily_purchase
ORDER BY dt
;
WITH
daily_purchase AS (
SELECT
dt
-- 分別擷取「年」、「月」、「日」
, substring(dt, 1, 4) AS year
, substring(dt, 6, 2) AS month
, substring(dt, 9, 2) AS date
, SUM(purchase_amount) AS purchase_amount
FROM purchase_log
GROUP BY dt
)
SELECT
dt
, concat(year, '-', month) AS year_month
, purchase_amount
, SUM(purchase_amount) OVER(PARTITION BY year, month ORDER BY dt ROWS UNBOUNDED PRECEDING) AS agg_amount
FROM daily_purchase
ORDER BY dt
;
l 計算每月營收的年成長率(SUM(CASE~END))(分析:年成長率):
n 範例資歷表:
DROP TABLE IF EXISTS purchase_log;
CREATE TABLE purchase_log(
dt varchar(255)
, order_id integer
, user_id varchar(255)
, purchase_amount integer
);
INSERT INTO purchase_log
VALUES
('2014-01-01', 1, 'rhwpvvitou', 13900)
, ('2014-02-08', 95, 'chtanrqtzj', 28469)
, ('2014-03-09', 168, 'bcqgtwxdgq', 18899)
, ('2014-04-11', 250, 'kdjyplrxtk', 12394)
, ('2014-05-11', 325, 'pgnjnnapsc', 2282)
, ('2014-06-12', 400, 'iztgctnnlh', 10180)
, ('2014-07-11', 475, 'eucjmxvjkj', 4027)
, ('2014-08-10', 550, 'fqwvlvndef', 6243)
, ('2014-09-10', 625, 'mhwhxfxrxq', 3832)
, ('2014-10-11', 700, 'wyrgiyvaia', 6716)
, ('2014-11-10', 775, 'cwpdvmhhwh', 16444)
, ('2014-12-10', 850, 'eqeaqvixkf', 29199)
, ('2015-01-09', 925, 'efmclayfnr', 22111)
, ('2015-02-10', 1000, 'qnebafrkco', 11965)
, ('2015-03-12', 1075, 'gsvqniykgx', 20215)
, ('2015-04-12', 1150, 'ayzvjvnocm', 11792)
, ('2015-05-13', 1225, 'knhevkibbp', 18087)
, ('2015-06-10', 1291, 'wxhxmzqxuw', 18859)
, ('2015-07-10', 1366, 'krrcpumtzb', 14919)
, ('2015-08-08', 1441, 'lpglkecvsl', 12906)
, ('2015-09-07', 1516, 'mgtlsfgfbj', 5696)
, ('2015-10-07', 1591, 'trgjscaajt', 13398)
, ('2015-11-06', 1666, 'ccfbjyeqrb', 6213)
, ('2015-12-05', 1741, 'onooskbtzp', 26024)
;
DROP TABLE IF EXISTS purchase_log;
CREATE TABLE purchase_log(
dt varchar(255)
, order_id integer
, user_id varchar(255)
, purchase_amount integer
);
INSERT INTO purchase_log
VALUES
('2014-01-01', 1, 'rhwpvvitou', 13900)
, ('2014-02-08', 95, 'chtanrqtzj', 28469)
, ('2014-03-09', 168, 'bcqgtwxdgq', 18899)
, ('2014-04-11', 250, 'kdjyplrxtk', 12394)
, ('2014-05-11', 325, 'pgnjnnapsc', 2282)
, ('2014-06-12', 400, 'iztgctnnlh', 10180)
, ('2014-07-11', 475, 'eucjmxvjkj', 4027)
, ('2014-08-10', 550, 'fqwvlvndef', 6243)
, ('2014-09-10', 625, 'mhwhxfxrxq', 3832)
, ('2014-10-11', 700, 'wyrgiyvaia', 6716)
, ('2014-11-10', 775, 'cwpdvmhhwh', 16444)
, ('2014-12-10', 850, 'eqeaqvixkf', 29199)
, ('2015-01-09', 925, 'efmclayfnr', 22111)
, ('2015-02-10', 1000, 'qnebafrkco', 11965)
, ('2015-03-12', 1075, 'gsvqniykgx', 20215)
, ('2015-04-12', 1150, 'ayzvjvnocm', 11792)
, ('2015-05-13', 1225, 'knhevkibbp', 18087)
, ('2015-06-10', 1291, 'wxhxmzqxuw', 18859)
, ('2015-07-10', 1366, 'krrcpumtzb', 14919)
, ('2015-08-08', 1441, 'lpglkecvsl', 12906)
, ('2015-09-07', 1516, 'mgtlsfgfbj', 5696)
, ('2015-10-07', 1591, 'trgjscaajt', 13398)
, ('2015-11-06', 1666, 'ccfbjyeqrb', 6213)
, ('2015-12-05', 1741, 'onooskbtzp', 26024)
;
n 計算每月營收和年成長率:
WITH
daily_purchase AS (
SELECT
dt
-- 分別擷取「年」、「月」、「日」
, substring(dt, 1, 4) AS year
, substring(dt, 6, 2) AS month
, substring(dt, 9, 2) AS date
, SUM(purchase_amount) AS purchase_amount
FROM purchase_log
GROUP BY dt
)
SELECT
month
, SUM(CASE year WHEN '2014' THEN purchase_amount END) AS amount_2014
, SUM(CASE year WHEN '2015' THEN purchase_amount END) AS amount_2015
, 100.0 * SUM(CASE year WHEN '2015' THEN purchase_amount END) / SUM(CASE year WHEN '2014' THEN purchase_amount END) AS rate
FROM daily_purchase
GROUP BY month
ORDER BY month
;
WITH
daily_purchase AS (
SELECT
dt
-- 分別擷取「年」、「月」、「日」
, substring(dt, 1, 4) AS year
, substring(dt, 6, 2) AS month
, substring(dt, 9, 2) AS date
, SUM(purchase_amount) AS purchase_amount
FROM purchase_log
GROUP BY dt
)
SELECT
month
, SUM(CASE year WHEN '2014' THEN purchase_amount END) AS amount_2014
, SUM(CASE year WHEN '2015' THEN purchase_amount END) AS amount_2015
, 100.0 * SUM(CASE year WHEN '2015' THEN purchase_amount END) / SUM(CASE year WHEN '2014' THEN purchase_amount END) AS rate
FROM daily_purchase
GROUP BY month
ORDER BY month
;
l 利用Z圖表確認業績的變化(SUM(CASE~END) OVER(ORDER BY~))(分析:Z圖表,建立Z圖表需要每月營收、累計營收、移動年營收三個指標):
n 範例資歷表:
同上例
同上例
n 建立2015年營收的Z圖表:
WITH
daily_purchase AS (
SELECT
dt
-- 分別擷取「年」、「月」、「日」
, substring(dt, 1, 4) AS year
, substring(dt, 6, 2) AS month
, substring(dt, 9, 2) AS date
, SUM(purchase_amount) AS purchase_amount
FROM purchase_log
GROUP BY dt
)
, monthly_amount AS (
-- 彙總每月營收
SELECT
year
, month
, SUM(purchase_amount) AS amount
FROM daily_purchase
GROUP BY year, month
)
, calc_index AS (
SELECT
year
, month
, amount
-- 計算2015年的累計營收
, SUM(CASE WHEN year = '2015' THEN amount END) OVER(ORDER BY year, month ROWS UNBOUNDED PRECEDING) AS agg_amount
-- 計算從當月開始到11個月之前的合計營收(移動年營收)
, SUM(amount) OVER(ORDER BY year, month ROWS BETWEEN 11 PRECEDING AND CURRENT ROW) AS year_avg_amount
FROM monthly_amount
ORDER BY year, month
)
-- 最後只擷取出2015年的數據
SELECT
concat(year, '-', month) AS year_month
, amount
, agg_amount
, year_avg_amount
FROM calc_index
WHERE year = '2015'
ORDER BY year_month
;
WITH
daily_purchase AS (
SELECT
dt
-- 分別擷取「年」、「月」、「日」
, substring(dt, 1, 4) AS year
, substring(dt, 6, 2) AS month
, substring(dt, 9, 2) AS date
, SUM(purchase_amount) AS purchase_amount
FROM purchase_log
GROUP BY dt
)
, monthly_amount AS (
-- 彙總每月營收
SELECT
year
, month
, SUM(purchase_amount) AS amount
FROM daily_purchase
GROUP BY year, month
)
, calc_index AS (
SELECT
year
, month
, amount
-- 計算2015年的累計營收
, SUM(CASE WHEN year = '2015' THEN amount END) OVER(ORDER BY year, month ROWS UNBOUNDED PRECEDING) AS agg_amount
-- 計算從當月開始到11個月之前的合計營收(移動年營收)
, SUM(amount) OVER(ORDER BY year, month ROWS BETWEEN 11 PRECEDING AND CURRENT ROW) AS year_avg_amount
FROM monthly_amount
ORDER BY year, month
)
-- 最後只擷取出2015年的數據
SELECT
concat(year, '-', month) AS year_month
, amount
, agg_amount
, year_avg_amount
FROM calc_index
WHERE year = '2015'
ORDER BY year_month
;
l 掌握影響營收的重要因素:
n 範例資歷表:
同上例
同上例
n 彙總營收相關指標的Query:
WITH
daily_purchase AS (
SELECT
dt
-- 分別擷取「年」、「月」、「日」
, substring(dt, 1, 4) AS year
, substring(dt, 6, 2) AS month
, substring(dt, 9, 2) AS date
, SUM(purchase_amount) AS purchase_amount
FROM purchase_log
GROUP BY dt
)
, monthly_purchase AS (
SELECT
year
, month
, AVG(purchase_amount) AS avg_amount
, SUM(purchase_amount) AS monthly
FROM daily_purchase
GROUP BY year, month
)
SELECT
concat(year, '-', month) AS year_month
, avg_amount
, monthly
, SUM(monthly) OVER(PARTITION BY year ORDER BY month ROWS UNBOUNDED PRECEDING) AS agg_amount
-- 取得12個月前的營收
, LAG(monthly, 12) OVER(ORDER BY year, month) AS last_year
-- 計算當月營收和12個月前營收的比值
, 100.0 * monthly / LAG(monthly, 12) OVER(ORDER BY year, month) AS rate
FROM monthly_purchase
ORDER BY year_month
;
WITH
daily_purchase AS (
SELECT
dt
-- 分別擷取「年」、「月」、「日」
, substring(dt, 1, 4) AS year
, substring(dt, 6, 2) AS month
, substring(dt, 9, 2) AS date
, SUM(purchase_amount) AS purchase_amount
FROM purchase_log
GROUP BY dt
)
, monthly_purchase AS (
SELECT
year
, month
, AVG(purchase_amount) AS avg_amount
, SUM(purchase_amount) AS monthly
FROM daily_purchase
GROUP BY year, month
)
SELECT
concat(year, '-', month) AS year_month
, avg_amount
, monthly
, SUM(monthly) OVER(PARTITION BY year ORDER BY month ROWS UNBOUNDED PRECEDING) AS agg_amount
-- 取得12個月前的營收
, LAG(monthly, 12) OVER(ORDER BY year, month) AS last_year
-- 計算當月營收和12個月前營收的比值
, 100.0 * monthly / LAG(monthly, 12) OVER(ORDER BY year, month) AS rate
FROM monthly_purchase
ORDER BY year_month
;
多面向的蒐集數據
l 計算各類別銷售金額與小計(UNION ALL句、ROLLUP句)(分析:小計):
n 範例資歷表:
DROP TABLE IF EXISTS purchase_detail_log;
CREATE TABLE purchase_detail_log(
dt varchar(255)
, order_id integer
, user_id varchar(255)
, item_id varchar(255)
, price integer
, category varchar(255)
, sub_category varchar(255)
);
INSERT INTO purchase_detail_log
VALUES
('2015-12-01', 1, 'U001', 'D001', 200, 'ladys_fashion', 'bag' )
, ('2015-12-08', 95, 'U002', 'D002', 300, 'dvd' , 'documentary')
, ('2015-12-09', 168, 'U003', 'D003', 500, 'game' , 'accessories')
, ('2015-12-11', 250, 'U004', 'D004', 800, 'ladys_fashion', 'jacket' )
, ('2015-12-11', 325, 'U005', 'D005', 200, 'mens_fashion' , 'jacket' )
, ('2015-12-12', 400, 'U006', 'D006', 400, 'cd' , 'classic' )
, ('2015-12-11', 475, 'U007', 'D007', 400, 'book' , 'business' )
, ('2015-12-10', 550, 'U008', 'D008', 600, 'food' , 'meats' )
, ('2015-12-10', 625, 'U009', 'D009', 600, 'food' , 'fish' )
, ('2015-12-11', 700, 'U010', 'D010', 200, 'supplement' , 'protain' )
;
DROP TABLE IF EXISTS purchase_detail_log;
CREATE TABLE purchase_detail_log(
dt varchar(255)
, order_id integer
, user_id varchar(255)
, item_id varchar(255)
, price integer
, category varchar(255)
, sub_category varchar(255)
);
INSERT INTO purchase_detail_log
VALUES
('2015-12-01', 1, 'U001', 'D001', 200, 'ladys_fashion', 'bag' )
, ('2015-12-08', 95, 'U002', 'D002', 300, 'dvd' , 'documentary')
, ('2015-12-09', 168, 'U003', 'D003', 500, 'game' , 'accessories')
, ('2015-12-11', 250, 'U004', 'D004', 800, 'ladys_fashion', 'jacket' )
, ('2015-12-11', 325, 'U005', 'D005', 200, 'mens_fashion' , 'jacket' )
, ('2015-12-12', 400, 'U006', 'D006', 400, 'cd' , 'classic' )
, ('2015-12-11', 475, 'U007', 'D007', 400, 'book' , 'business' )
, ('2015-12-10', 550, 'U008', 'D008', 600, 'food' , 'meats' )
, ('2015-12-10', 625, 'U009', 'D009', 600, 'food' , 'fish' )
, ('2015-12-11', 700, 'U010', 'D010', 200, 'supplement' , 'protain' )
;
n 同時取得類別銷售金額與小計的Querys:
WITH
sub_category_amount AS (
-- 彙總小類別的銷售金額
SELECT
category AS category
, sub_category AS sub_category
, SUM(price) AS amount
FROM purchase_detail_log
GROUP BY category, sub_category
)
, category_amount AS (
-- 彙總大類別的銷售金額
SELECT
category
, 'all' AS sub_category
, SUM(price) AS amount
FROM purchase_detail_log
GROUP BY category
)
, total_amount AS (
-- 計算全體銷售總額
SELECT
'all' AS category
, 'all' AS sub_category
, SUM(price) AS amount
FROM purchase_detail_log
)
SELECT category, sub_category, amount
FROM sub_category_amount
UNION ALL
SELECT category, sub_category, amount
FROM category_amount
UNION ALL
SELECT category, sub_category, amount
FROM total_amount
;
WITH
sub_category_amount AS (
-- 彙總小類別的銷售金額
SELECT
category AS category
, sub_category AS sub_category
, SUM(price) AS amount
FROM purchase_detail_log
GROUP BY category, sub_category
)
, category_amount AS (
-- 彙總大類別的銷售金額
SELECT
category
, 'all' AS sub_category
, SUM(price) AS amount
FROM purchase_detail_log
GROUP BY category
)
, total_amount AS (
-- 計算全體銷售總額
SELECT
'all' AS category
, 'all' AS sub_category
, SUM(price) AS amount
FROM purchase_detail_log
)
SELECT category, sub_category, amount
FROM sub_category_amount
UNION ALL
SELECT category, sub_category, amount
FROM category_amount
UNION ALL
SELECT category, sub_category, amount
FROM total_amount
;
n 使用ROLLUP同時取得類別銷售金額和小計:
SELECT
COALESCE(category, 'all') AS category
, COALESCE(sub_category, 'all') AS sub_category
, SUM(price) AS amount
FROM purchase_detail_log
GROUP BY ROLLUP(category, sub_category)
;
SELECT
COALESCE(category, 'all') AS category
, COALESCE(sub_category, 'all') AS sub_category
, SUM(price) AS amount
FROM purchase_detail_log
GROUP BY ROLLUP(category, sub_category)
;
l 利用ABC分析判別銷售情形(SUM(~) OVER(ORDER BY~))(分析:累計百分比、ABC分析:Rank A是累積百分比前面0~70%、Rank B是累積百分比70~90%、Rank C是累積百分比90~100%):
n 範例資歷表:
同上例
同上例
n 計算營收組成的累計百分比與ABC Rank:
WITH
monthly_sales AS (
SELECT
category
-- 計算各類別的營收
, SUM(price) AS amount
FROM purchase_detail_log
-- 將數據範圍限制在目標月份
WHERE dt BETWEEN '2015-12-01' AND '2015-12-31'
GROUP BY category
)
, sales_composition_ratio AS (
SELECT
category
, amount
-- 所佔百分比: 100.0 * 類別營收 / 全體營收
, 100.0 * amount / SUM(amount) OVER() AS composition_ratio
-- 累計百分比: 100.0 * 類別累計營收 / 全體營收
, 100.0 * SUM(amount) OVER(ORDER BY amount DESC) / SUM(amount) OVER() AS cumulative_ratio
FROM monthly_sales
)
SELECT
*
-- 根據累計百分比進行分級
, CASE
WHEN cumulative_ratio BETWEEN 0 AND 70 THEN 'A'
WHEN cumulative_ratio BETWEEN 70 AND 90 THEN 'B'
WHEN cumulative_ratio BETWEEN 90 AND 100 THEN 'C'
END AS abc_rank
FROM sales_composition_ratio
ORDER BY amount DESC
;
WITH
monthly_sales AS (
SELECT
category
-- 計算各類別的營收
, SUM(price) AS amount
FROM purchase_detail_log
-- 將數據範圍限制在目標月份
WHERE dt BETWEEN '2015-12-01' AND '2015-12-31'
GROUP BY category
)
, sales_composition_ratio AS (
SELECT
category
, amount
-- 所佔百分比: 100.0 * 類別營收 / 全體營收
, 100.0 * amount / SUM(amount) OVER() AS composition_ratio
-- 累計百分比: 100.0 * 類別累計營收 / 全體營收
, 100.0 * SUM(amount) OVER(ORDER BY amount DESC) / SUM(amount) OVER() AS cumulative_ratio
FROM monthly_sales
)
SELECT
*
-- 根據累計百分比進行分級
, CASE
WHEN cumulative_ratio BETWEEN 0 AND 70 THEN 'A'
WHEN cumulative_ratio BETWEEN 70 AND 90 THEN 'B'
WHEN cumulative_ratio BETWEEN 90 AND 100 THEN 'C'
END AS abc_rank
FROM sales_composition_ratio
ORDER BY amount DESC
;
l 利用Fan chart判讀商品銷售的成長率(FIRST_VALUE視窗函數)(分析:Fan chart,是將作為基準的特定時間點設定為100%,以圖形的方式觀察之後的數值變動情形):
n 範例資歷表:
同上例
同上例
n 取得建立Fan chart所需之數據:
WITH
daily_category_amount AS (
SELECT
dt
, category
, substring(dt, 1, 4) AS year
, substring(dt, 6, 2) AS month
, substring(dt, 9, 2) AS date
, SUM(price) AS amount
FROM purchase_detail_log
GROUP BY dt, category
)
, monthly_category_amount AS (
SELECT
concat(year, '-', month) AS year_month
, category
, SUM(amount) AS amount
FROM daily_category_amount
GROUP BY year, month, category
)
SELECT
year_month
, category
, amount
, FIRST_VALUE(amount) OVER(PARTITION BY category ORDER BY year_month, category ROWS UNBOUNDED PRECEDING) AS base_amount
, 100.0 * amount / FIRST_VALUE(amount) OVER(PARTITION BY category ORDER BY year_month, category ROWS UNBOUNDED PRECEDING) AS rate
FROM monthly_category_amount
ORDER BY year_month, category
;
WITH
daily_category_amount AS (
SELECT
dt
, category
, substring(dt, 1, 4) AS year
, substring(dt, 6, 2) AS month
, substring(dt, 9, 2) AS date
, SUM(price) AS amount
FROM purchase_detail_log
GROUP BY dt, category
)
, monthly_category_amount AS (
SELECT
concat(year, '-', month) AS year_month
, category
, SUM(amount) AS amount
FROM daily_category_amount
GROUP BY year, month, category
)
SELECT
year_month
, category
, amount
, FIRST_VALUE(amount) OVER(PARTITION BY category ORDER BY year_month, category ROWS UNBOUNDED PRECEDING) AS base_amount
, 100.0 * amount / FIRST_VALUE(amount) OVER(PARTITION BY category ORDER BY year_month, category ROWS UNBOUNDED PRECEDING) AS rate
FROM monthly_category_amount
ORDER BY year_month, category
;
l 使用直方圖彙總購買價格區間(width_bucket函數)(分析:次數分配表、直方圖):
n 範例資歷表:
同上例
同上例
n 計算各數據所屬階級,調整階級上限的值:
WITH
stats AS (
SELECT
-- 銷售價格的最大值 + 1
MAX(price) + 1 AS max_price
-- 銷售價格的最小值
, MIN(price) AS min_price
-- 全體價格的範圍 + 1
, MAX(price) + 1 - MIN(price) AS range_price
-- 階級數
, 10 AS bucket_num
FROM purchase_detail_log
)
, purchase_log_with_bucket AS (
SELECT
price
, min_price
-- 標準化價格:目標價格減去最小價格
, price - min_price AS diff
-- 階級範圍:全體價格區間除以階級數
, 1.0 * range_price / bucket_num AS bucket_range
-- 階級的判定:FLOOR(標準化價格/階級範圍)
, FLOOR(
1.0 * (price - min_price) / (1.0 * range_price / bucket_num)
-- 由於index是從1開始所以加上1
) + 1 AS bucket
-- 階級的判定:或使用PostgreSQL內建的函數
-- , width_bucket(price, min_price, max_price, bucket_num) AS bucket
FROM purchase_detail_log, stats
)
SELECT *
FROM purchase_log_with_bucket
ORDER BY price
;
WITH
stats AS (
SELECT
-- 銷售價格的最大值 + 1
MAX(price) + 1 AS max_price
-- 銷售價格的最小值
, MIN(price) AS min_price
-- 全體價格的範圍 + 1
, MAX(price) + 1 - MIN(price) AS range_price
-- 階級數
, 10 AS bucket_num
FROM purchase_detail_log
)
, purchase_log_with_bucket AS (
SELECT
price
, min_price
-- 標準化價格:目標價格減去最小價格
, price - min_price AS diff
-- 階級範圍:全體價格區間除以階級數
, 1.0 * range_price / bucket_num AS bucket_range
-- 階級的判定:FLOOR(標準化價格/階級範圍)
, FLOOR(
1.0 * (price - min_price) / (1.0 * range_price / bucket_num)
-- 由於index是從1開始所以加上1
) + 1 AS bucket
-- 階級的判定:或使用PostgreSQL內建的函數
-- , width_bucket(price, min_price, max_price, bucket_num) AS bucket
FROM purchase_detail_log, stats
)
SELECT *
FROM purchase_log_with_bucket
ORDER BY price
;
n 取得建立直方圖所需之數據:
WITH
stats AS (
SELECT
-- 銷售價格的最大值 + 1
MAX(price) + 1 AS max_price
-- 銷售價格的最小值
, MIN(price) AS min_price
-- 全體價格的範圍 + 1
, MAX(price) + 1 - MIN(price) AS range_price
-- 階級數
, 10 AS bucket_num
FROM purchase_detail_log
)
, purchase_log_with_bucket AS (
SELECT
price
, min_price
-- 標準化價格:目標價格減去最小價格
, price - min_price AS diff
-- 階級範圍:全體價格區間除以階級數
, 1.0 * range_price / bucket_num AS bucket_range
-- 階級的判定:FLOOR(標準化價格/階級範圍)
, FLOOR(
1.0 * (price - min_price) / (1.0 * range_price / bucket_num)
-- 由於index是從1開始所以加上1
) + 1 AS bucket
-- 階級的判定:或使用PostgreSQL內建的函數
-- , width_bucket(price, min_price, max_price, bucket_num) AS bucket
FROM purchase_detail_log, stats
)
SELECT
bucket
-- 計算階級的上限與下限
, min_price + bucket_range * (bucket - 1) AS lower_limit
, min_price + bucket_range * bucket AS upper_limit
-- 計算次數
, COUNT(price) AS num_purchase
-- 計算各階級的合計金額
, SUM(price) AS total_amount
FROM purchase_log_with_bucket
GROUP BY bucket, min_price, bucket_range
ORDER BY bucket
;
WITH
stats AS (
SELECT
-- 銷售價格的最大值 + 1
MAX(price) + 1 AS max_price
-- 銷售價格的最小值
, MIN(price) AS min_price
-- 全體價格的範圍 + 1
, MAX(price) + 1 - MIN(price) AS range_price
-- 階級數
, 10 AS bucket_num
FROM purchase_detail_log
)
, purchase_log_with_bucket AS (
SELECT
price
, min_price
-- 標準化價格:目標價格減去最小價格
, price - min_price AS diff
-- 階級範圍:全體價格區間除以階級數
, 1.0 * range_price / bucket_num AS bucket_range
-- 階級的判定:FLOOR(標準化價格/階級範圍)
, FLOOR(
1.0 * (price - min_price) / (1.0 * range_price / bucket_num)
-- 由於index是從1開始所以加上1
) + 1 AS bucket
-- 階級的判定:或使用PostgreSQL內建的函數
-- , width_bucket(price, min_price, max_price, bucket_num) AS bucket
FROM purchase_detail_log, stats
)
SELECT
bucket
-- 計算階級的上限與下限
, min_price + bucket_range * (bucket - 1) AS lower_limit
, min_price + bucket_range * bucket AS upper_limit
-- 計算次數
, COUNT(price) AS num_purchase
-- 計算各階級的合計金額
, SUM(price) AS total_amount
FROM purchase_log_with_bucket
GROUP BY bucket, min_price, bucket_range
ORDER BY bucket
;
n 手動調整直方圖的上限與下限:
WITH
stats AS (
SELECT
-- 價格的最大值
5000 AS max_price
-- 價格的最小值
, 0 AS min_price
-- 價格區間
, 5000 AS range_price
-- 階級數
, 10 AS bucket_num
FROM purchase_detail_log
)
, purchase_log_with_bucket AS (
SELECT
price
, min_price
-- 標準化價格:目標價格減去最小價格
, price - min_price AS diff
-- 階級範圍:全體價格區間除以階級數
, 1.0 * range_price / bucket_num AS bucket_range
-- 階級的判定:FLOOR(標準化價格/階級範圍)
, FLOOR(
1.0 * (price - min_price) / (1.0 * range_price / bucket_num)
-- 由於index是從1開始所以加上1
) + 1 AS bucket
-- 階級的判定:或使用PostgreSQL內建的函數
-- , width_bucket(price, min_price, max_price, bucket_num) AS bucket
FROM purchase_detail_log, stats
)
SELECT
bucket
-- 計算階級的上限與下限
, min_price + bucket_range * (bucket - 1) AS lower_limit
, min_price + bucket_range * bucket AS upper_limit
-- 計算次數
, COUNT(price) AS num_purchase
-- 計算各階級的合計金額
, SUM(price) AS total_amount
FROM purchase_log_with_bucket
GROUP BY bucket, min_price, bucket_range
ORDER BY bucket
;
WITH
stats AS (
SELECT
-- 價格的最大值
5000 AS max_price
-- 價格的最小值
, 0 AS min_price
-- 價格區間
, 5000 AS range_price
-- 階級數
, 10 AS bucket_num
FROM purchase_detail_log
)
, purchase_log_with_bucket AS (
SELECT
price
, min_price
-- 標準化價格:目標價格減去最小價格
, price - min_price AS diff
-- 階級範圍:全體價格區間除以階級數
, 1.0 * range_price / bucket_num AS bucket_range
-- 階級的判定:FLOOR(標準化價格/階級範圍)
, FLOOR(
1.0 * (price - min_price) / (1.0 * range_price / bucket_num)
-- 由於index是從1開始所以加上1
) + 1 AS bucket
-- 階級的判定:或使用PostgreSQL內建的函數
-- , width_bucket(price, min_price, max_price, bucket_num) AS bucket
FROM purchase_detail_log, stats
)
SELECT
bucket
-- 計算階級的上限與下限
, min_price + bucket_range * (bucket - 1) AS lower_limit
, min_price + bucket_range * bucket AS upper_limit
-- 計算次數
, COUNT(price) AS num_purchase
-- 計算各階級的合計金額
, SUM(price) AS total_amount
FROM purchase_log_with_bucket
GROUP BY bucket, min_price, bucket_range
ORDER BY bucket
;
三、彙整第五至八章出現的新語法
彙整第五至八章出現的新語法
l 利用SQL標準中所定義的CUBE語法,就可簡單地考量到所有的組合方式,並彙總數據:
-- 使用CUBE列舉action的所有組合方式
SELECT
has_purchase
, has_review
, has_favorite
, COUNT(1) AS users
FROM user_action_flag
GROUP BY CUBE(has_purchase, has_review, has_favorite)
;
-- 使用CUBE列舉action的所有組合方式
SELECT
has_purchase
, has_review
, has_favorite
, COUNT(1) AS users
FROM user_action_flag
GROUP BY CUBE(has_purchase, has_review, has_favorite)
;
l 十分位分析(Decile Analysis)這種等間距的分組方式,可使用NTILE視窗函數進行:
SELECT
user_id
, purchase_amount
, ntile(10) OVER (ORDER BY purchase_amount DESC) AS decile
FROM user_purchase_amount
;
SELECT
user_id
, purchase_amount
, ntile(10) OVER (ORDER BY purchase_amount DESC) AS decile
FROM user_purchase_amount
;
l 利用PERCENT_RANK函數以百分比呈現網頁瀏覽數的排行,PERCENT_RANK的值是用(rank - 1) / (全部數據行數 - 1)所計算出的百分比:
SELECT
session
, count
, RANK() OVER(ORDER BY count DESC) AS rank
, PERCENT_RANK() OVER(ORDER BY count DESC) AS percent_rank
FROM session_count
;
SELECT
session
, count
, RANK() OVER(ORDER BY count DESC) AS rank
, PERCENT_RANK() OVER(ORDER BY count DESC) AS percent_rank
FROM session_count
;
l LIKE運算子,利用規則排除數據:
SELECT *
FROM action_log_with_noise
WHERE
NOT
-- 列舉網路爬蟲的判定條件
(user_agent LIKE'%bot%'
OR user_agent LIKE'%crawler%'
OR user_agent LIKE'%spider%'
OR user_agent LIKE'%archiver%'
)
;
SELECT *
FROM action_log_with_noise
WHERE
NOT
-- 列舉網路爬蟲的判定條件
(user_agent LIKE'%bot%'
OR user_agent LIKE'%crawler%'
OR user_agent LIKE'%spider%'
OR user_agent LIKE'%archiver%'
)
;
l HAVING句,確認鍵值重複record的數值:
SELECT
id
, COUNT(*) AS record_num
-- 將數據彙總至陣列,並以逗點作為區隔符號轉換成字串
, string_agg(name, ',') AS name_list
, string_agg(stamp, ',') AS stamp_list
FROM mst_categories
GROUP BY id
-- 只保留有重複的ID
HAVING COUNT(*) > 1
;
SELECT
id
, COUNT(*) AS record_num
-- 將數據彙總至陣列,並以逗點作為區隔符號轉換成字串
, string_agg(name, ',') AS name_list
, string_agg(stamp, ',') AS stamp_list
FROM mst_categories
GROUP BY id
-- 只保留有重複的ID
HAVING COUNT(*) > 1
;
l 計算標準差、標準分數、偏差值:
SELECT
subject
, name
, score
-- 計算每個學科的標準差
, stddev_pop(score) OVER(PARTITION BY subject) AS stddev_pop
-- 計算每個學科的平均數
, AVG(score) OVER(PARTITION BY subject) AS avg_score
-- 計算每個分數的標準分數
, (score - AVG(score) OVER(PARTITION BY subject)) / stddev_pop(score) OVER(PARTITION BY subject) AS std_value
-- 計算每個分數的偏差值
, 10.0 * (score - AVG(score) OVER(PARTITION BY subject)) / stddev_pop(score) OVER(PARTITION BY subject) + 50 AS deviation
FROM exam_scores
ORDER BY subject, name
;
SELECT
subject
, name
, score
-- 計算每個學科的標準差
, stddev_pop(score) OVER(PARTITION BY subject) AS stddev_pop
-- 計算每個學科的平均數
, AVG(score) OVER(PARTITION BY subject) AS avg_score
-- 計算每個分數的標準分數
, (score - AVG(score) OVER(PARTITION BY subject)) / stddev_pop(score) OVER(PARTITION BY subject) AS std_value
-- 計算每個分數的偏差值
, 10.0 * (score - AVG(score) OVER(PARTITION BY subject)) / stddev_pop(score) OVER(PARTITION BY subject) + 50 AS deviation
FROM exam_scores
ORDER BY subject, name
;
沒有留言:
張貼留言