2019年11月14日 星期四

SQL商業資料分析術學習筆記

  有鑑於工作上的需求,閱讀這本由加嵜長門、田宮直人著作,朱浚賢翻譯的《SQL商業資料分析術》,除了因應系統管理員的職務所需,更因應自己對於資料科學的學習需求,閱讀之後個人認為這本書並不是針對初學者所設計,而是一本針對不同資料庫所彙總的大百科,書裡面針對同一種查詢需求,分別列出在PostgreSQLApache HiveAmazon RedshiftGoogle BigQuerySparkSQL資料庫系統上面,應該如何撰寫語法。

  恰巧適逢自己轉職的需求,需要熟悉PostgreSQL資料庫系統,故本學習筆記僅記錄PostgreSQL的語法,並僅收錄「第三章:使用SQL整理數據」以及「第四章:營收狀態相關數據的彙總、分析」的說明範例,其餘後續章節探討的議題如使用者行為、網站指標、精確度、進階數據活用術等,則因其聚焦於某特定需求,加上鮮少使用新的語法,故決定不整理、不收錄至此筆記,俟未來有使用需求,才會再重新借閱、閱讀此書。

一、使用SQL整理數據

單一數值的整理
l   將編碼值轉換成標籤(CASE語句)
n   範例資料表:
DROP TABLE IF EXISTS mst_users;
CREATE TABLE mst_users(
    user_id         varchar(255)
  , register_date   varchar(255)
  , register_device integer
);

INSERT INTO mst_users
VALUES
    ('U001', '2016-08-26', 1)
  , ('U002', '2016-08-26', 2)
  , ('U003', '2016-08-27', 3)
;
n   將編碼值置換成標籤的Query
SELECT
    user_id
    , CASE
        WHEN register_device = 1 THEN 'PC'
        WHEN register_device = 2 THEN 'SP'
        WHEN register_device = 3 THEN 'APP'
        --
要指定預設值時使用ELSE
        -- ELSE ''
    END AS device_name
FROM mst_users
;
l   URL擷取元素(URL函數、正規表達式)
n   範例資料表:
DROP TABLE IF EXISTS access_log ;
CREATE TABLE access_log (
    stamp    varchar(255)
  , referrer text
  , url      text
);

INSERT INTO access_log
VALUES
    ('2016-08-26 12:02:00', 'http://www.other.com/path1/index.php?k1=v1&k2=v2#Ref1', 'http://www.example.com/video/detail?id=001')
  , ('2016-08-26 12:02:01', 'http://www.other.net/path1/index.php?k1=v1&k2=v2#Ref1', 'http://www.example.com/video#ref'          )
  , ('2016-08-26 12:02:01', 'https://www.other.com/'                               , 'http://www.example.com/book/detail?id=002' )
;
n   Referrer擷取網域(domain)Query
SELECT
    stamp
    --
擷取referer的主機名稱
    , substring(referrer from 'https?://([^/]*)') AS referrer_host
FROM access_log
;
n   擷取URL的路徑和GET參數中的特定值:
SELECT
    stamp
    , url
    --
擷取URL的路徑和GET參數的id
    , substring(url from '//[^/]+([^?#]+)') AS path
    , substring(url from 'id=([^&]*)') AS id
FROM access_log
;
l   將字串分割至陣列(split_part函數、split函數)
n   範例資料表:
DROP TABLE IF EXISTS access_log ;
CREATE TABLE access_log (
    stamp    varchar(255)
  , referrer text
  , url      text
);

INSERT INTO access_log
VALUES
    ('2016-08-26 12:02:00', 'http://www.other.com/path1/index.php?k1=v1&k2=v2#Ref1', 'http://www.example.com/video/detail?id=001')
  , ('2016-08-26 12:02:01', 'http://www.other.net/path1/index.php?k1=v1&k2=v2#Ref1', 'http://www.example.com/video#ref'          )
  , ('2016-08-26 12:02:01', 'https://www.other.com/'                               , 'http://www.example.com/book/detail?id=002' )
;
n   用斜線從URLpath中擷取出階層資訊:
SELECT
    stamp
    , url
    --
利用斜線分割Path,依階層分別擷取出階層名稱
    --
由於Path一定由斜線開頭,所以第二個元素就會是起始的階層
    , split_part(substring(url from '//[^/]+([^?#]+)'), '/', 2) AS path1
    , split_part(substring(url from '//[^/]+([^?#]+)'), '/', 3) AS path2
FROM access_log
;
l   日期與時間戳記的處理(日期/時間函數、date型別、timestamp型別、字串函數)
n   取得現在日期與時間戳記的Query
SELECT
    --
使用CURRENT_DATE常數和CURRENT_TIMESTAMP常數
    CURRENT_DATE AS dt
    , CURRENT_TIMESTAMP AS stamp
    --
希望取得不含時區的timestamp, 請使用LOCALTIMESTAMP
    -- , LOCALTIMESTAMP AS stamp
;
n   將字串轉換成日期型別、timestamp型別的Query
SELECT
    -- CAST(value AS type)
的形式來轉換資料型別
    CAST('2016-01-30' AS date) AS dt
    , CAST('2016-01-30 12:00:00' AS timestamp) AS stamp
    --
可使用type value的形式來轉換資料型別
    --
但是value必須為常數,不能設定為欄位名稱
    -- date '2016-01-30' AS dt
    -- , timestamp '2016-01-30 12:00:00' AS stamp
    --
可使用value::type的形式來轉換資料型別
    -- '2016-01-30'::date AS dt
    -- , '2016-01-30 12:00:00'::timestamp AS stamp
;
n   timestamp型別的資料中取得年、月、日...等特定單位的值:
SELECT
    stamp
    , EXTRACT(YEAR FROM stamp) AS year
    , EXTRACT(MONTH FROM stamp) AS month
    , EXTRACT(DAY FROM stamp) AS day
    , EXTRACT(HOUR FROM stamp) AS hour
FROM
    (SELECT CAST('2016-01-30 12:00:00' AS timestamp) AS stamp) AS t
;
n   從代表timestamp的字串中取得年、月、日...等值:
SELECT
    stamp
    --
下列substring函數皆可改為使用substr函數
    , substring(stamp, 1, 4) AS year
    , substring(stamp, 6, 2) AS month
    , substring(stamp, 9, 2) AS day
    , substring(stamp, 12, 2) AS hour
    --
同時擷取年和月
    , substring(stamp, 1, 7) AS year_month
FROM
    (SELECT CAST('2016-01-30 12:00:00' AS text) AS stamp) AS t
;
l   以預設值取代缺損的資料(COALESCE函數)
n   範例資料表:
DROP TABLE IF EXISTS purchase_log_with_coupon;
CREATE TABLE purchase_log_with_coupon (
    purchase_id varchar(255)
  , amount      integer
  , coupon      integer
);

INSERT INTO purchase_log_with_coupon
VALUES
    ('10001', 3280, NULL)
  , ('10002', 4650,  500)
  , ('10003', 3870, NULL)
;
n   計算折價後的銷售金額:
SELECT
    purchase_id
    , amount
    , coupon
    , amount - coupon AS discount_amount1
    , amount - COALESCE(coupon, 0) AS discount_amount2
FROM purchase_log_with_coupon
;

多個數值的整理
l   連接字串(CONCAT函數、||運算子)
n   範例資料表:
DROP TABLE IF EXISTS mst_user_location;
CREATE TABLE mst_user_location (
    user_id   varchar(255)
  , pref_name varchar(255)
  , city_name varchar(255)
);

INSERT INTO mst_user_location
VALUES
    ('U001', '
東京都', '千代田區')
  , ('U002', '
東京都', '涉谷區'  )
  , ('U003', '
千葉縣', '八千代市')
;
n   連接字串的Query
SELECT
    user_id
    , CONCAT(pref_name, city_name) AS pref_city
    --
也可使用||運算子
    -- , pref_name || city_name AS pref_city
FROM mst_user_location
;
l   比較多個數值(CASE語法、SIGN函數、greatest函數、least函數、四則運算子)
n   範例資料表:
DROP TABLE IF EXISTS quarterly_sales;
CREATE TABLE quarterly_sales (
    year integer
  , q1   integer
  , q2   integer
  , q3   integer
  , q4   integer
);

INSERT INTO quarterly_sales
VALUES
    (2015, 82000, 83000, 78000, 83000)
  , (2016, 85000, 85000, 80000, 81000)
  , (2017, 92000, 81000, NULL , NULL )
;
n   比較Q1Q2兩個欄位的Query
SELECT
    year
    , q1
    , q2
    -- Q1
Q2的營收變化比較
    , CASE
        WHEN q1 < q2 THEN '+'
        WHEN q1 = q2 THEN ' '
        ELSE '-'
    END AS judge_q1_q2
    --
計算Q1Q2營收差
    , q2 - q1 AS diff_q2_q1
    --
Q1Q2營收變化比較用1, 0, -1呈現
    , SIGN(q2 - q1) AS sign_q2_q1
FROM quarterly_sales
ORDER BY year
;
n   找出年間最大/最小營收的季度:
SELECT
    year
    --
取得Q1~Q4的最大營收
    , greatest(q1, q2, q3, q4) AS greatest_sales
    --
取得Q1~Q4的最小營收
    , least(q1, q2, q3, q4) AS least_sales
FROM quarterly_sales
ORDER BY year
;
n   單純計算年間平均季度營收的Query
SELECT
    year
    , (q1 + q2 + q3 + q4) / 4 AS average
FROM quarterly_sales
ORDER BY year
;
n   使用COALESCENULL置換成0後計算平均值:
SELECT
    year
    , (COALESCE(q1, 0) + COALESCE(q2, 0) + COALESCE(q3, 0) + COALESCE(q4, 0)) / 4 AS average
FROM quarterly_sales
ORDER BY year
;
n   只用數值不為NULL的欄位來計算平均值:
SELECT
    year
    , (COALESCE(q1, 0) + COALESCE(q2, 0) + COALESCE(q3, 0) + COALESCE(q4, 0)) / (SIGN(COALESCE(q1, 0)) + SIGN(COALESCE(q2, 0)) + SIGN(COALESCE(q3, 0)) + SIGN(COALESCE(q4, 0))) AS average
FROM quarterly_sales
ORDER BY year
;
l   計算兩個數值的比率(除法、CAST函數、CASE語法、NULLIF函數)
n   範例資料表:
DROP TABLE IF EXISTS advertising_stats;
CREATE TABLE advertising_stats (
    dt          varchar(255)
  , ad_id       varchar(255)
  , impressions integer
  , clicks      integer
);

INSERT INTO advertising_stats
VALUES
    ('2017-04-01', '001', 100000,  3000)
  , ('2017-04-01', '002', 120000,  1200)
  , ('2017-04-01', '003', 500000, 10000)
  , ('2017-04-02', '001',      0,     0)
  , ('2017-04-02', '002', 130000,  1400)
  , ('2017-04-02', '003', 620000, 15000)
;
n   整數型別資料的除法運算:
SELECT
    dt
    , ad_id
    --
PostgreSQL, 整數型別的除法運算會捨去小數點以下的數值,所以要手動轉換資料型別
    , CAST(clicks AS double precision) / impressions AS ctr
    --
在開頭用含小數的常數乘以clicks,資料型別就會自動進行轉換
    , 100.0 * clicks / impressions AS ctr_as_percent
FROM advertising_stats
WHERE dt = '2017-04-01'
ORDER BY dt, ad_id
;
n   避免除0並計算CTR
SELECT
    dt
    , ad_id
    --
CASE句設定分母不為0的條件,來避免除以0
    , CASE
        WHEN impressions > 0 THEN 100.0 * clicks / impressions
    END AS ctr_as_percent_by_case
    --
當分母為0時就轉換成NULL,來避免除以0
    , 100.0 * clicks / NULLIF(impressions, 0) AS ctr_as_percent_by_null
FROM advertising_stats
ORDER BY dt, ad_id
;
l   計算兩個值的距離(absc函數、power函數、sqrt函數、point型別、<->運算子)
n   範例資料表:
DROP TABLE IF EXISTS location_1d;
CREATE TABLE location_1d (
    x1 integer
  , x2 integer
);

INSERT INTO location_1d
VALUES
    ( 5 , 10)
  , (10 ,  5)
  , (-2 ,  4)
  , ( 3 ,  3)
  , ( 0 ,  1)
;

DROP TABLE IF EXISTS location_2d;
CREATE TABLE location_2d (
    x1 integer
  , y1 integer
  , x2 integer
  , y2 integer
);

INSERT INTO location_2d
VALUES
    (0, 0, 2, 2)
  , (3, 5, 1, 2)
  , (5, 3, 2, 1)
;
n   計算一次元數據的絕對值和方均根:
SELECT
    abs(x1 - x2) AS abs
    , sqrt(power(x1 - x2, 2)) AS rms
FROM location_1d
;
n   對二次元數據計算均方根(歐幾里得距離)
SELECT
    sqrt(power(x1 - x2, 2) + power(y1 - y2, 2)) AS dist
    --
PostgreSQL可用point型別和距離運算子<->
    -- point(x1, y1) <-> point(x2, y2) AS dist
FROM location_2d
;
l   計算日期、時間(interval型別、日期時間函數)
n   範例資料表:
DROP TABLE IF EXISTS mst_users_with_birthday;
CREATE TABLE mst_users_with_birthday (
    user_id        varchar(255)
  , register_stamp varchar(255)
  , birth_date     varchar(255)
);

INSERT INTO mst_users_with_birthday
VALUES
    ('U001', '2016-02-28 10:00:00', '2000-02-29')
  , ('U002', '2016-02-29 10:00:00', '2000-02-29')
  , ('U003', '2016-03-01 10:00:00', '2000-02-29')
;
n   計算未來或過去的日期時間:
SELECT
    user_id
    --
可藉由interval型別的數據進行四則運算
    , register_stamp::timestamp AS register_stamp
    , register_stamp::timestamp + '1 hour'::interval AS after_1_hour
    , register_stamp::timestamp - '30 minutes'::interval AS before_30_minutes
    , register_stamp::date AS register_date
    , (register_stamp::date + '1 day'::interval)::date AS after_1_day
    , (register_stamp::date - '1 month'::interval)::date AS before_1_month
FROM mst_users_with_birthday
;
n   計算兩個日期之間的天數差:
SELECT
    user_id
    --
日期型別數據之間可進行減法
    , CURRENT_DATE AS today
    , register_stamp::date AS register_date
    , CURRENT_DATE - register_stamp::date AS diff_days
FROM mst_users_with_birthday
;
n   使用age函數計算年齡:
SELECT
    user_id
    --
使用age函數和EXTRACT函數計算以年為單位的年齡
    , CURRENT_DATE AS today
    , register_stamp::date AS register_date
    , birth_date::date AS birth_date
    , EXTRACT(YEAR FROM age(birth_date::date)) AS current_age
    , EXTRACT(YEAR FROM age(register_stamp::date, birth_date::date)) AS register_age
FROM mst_users_with_birthday
;
n   將日期以整數表示並計算年齡的Query
--
計算生日為2000229日的人,在2016228日時的年齡
SELECT floor((20160228 - 20000229) / 10000) AS age;
n   從文字型別的生日計算註冊時間點和現在的年齡:
SELECT
    user_id
    , substring(register_stamp, 1, 10) AS register_date
    , birth_date
    --
計算註冊時的年齡
    , floor(
        (CAST(replace(substring(register_stamp, 1, 10), '-', '') AS integer)
        - CAST(replace(birth_date, '-', '') AS integer)
        ) / 10000
    ) AS register_age
    --
計算現在的年齡
    , floor(
        (CAST(replace(CAST(CURRENT_DATE AS text), '-', '') AS integer)
        - CAST(replace(birth_date, '-', '') AS integer)
        ) / 10000
    ) AS current_age
FROM mst_users_with_birthday
;
l   處理IP位置(inet型別、<<運算子、split_part函數、lpad函數)
n   比較inet型別的IP資料:
SELECT
    CAST('127.0.0.1' AS inet) < CAST('127.0.0.2' AS inet) AS lt
    , CAST('127.0.0.1' AS inet) > CAST('192.168.0.1' AS inet) AS gt
;
n   判別inet型別IP位置的範圍:
SELECT CAST('127.0.0.1' AS inet) << CAST('127.0.0.0/8' AS inet) AS is_contained;
n   IP位置擷取出四個octetQuery
SELECT
    ip
    , CAST(split_part(ip, '.', 1) AS integer) AS ip_part_1
    , CAST(split_part(ip, '.', 2) AS integer) AS ip_part_2
    , CAST(split_part(ip, '.', 3) AS integer) AS ip_part_3
    , CAST(split_part(ip, '.', 4) AS integer) AS ip_part_4
FROM
    --
PostgreSQL必須明確地指定資料型別
    (SELECT CAST('192.168.0.1' AS text) AS ip) AS t
;
n   IP位置以整數型別來呈現的Query
SELECT
    ip
    , CAST(split_part(ip, '.', 1) AS integer) * 2^24
    + CAST(split_part(ip, '.', 2) AS integer) * 2^16
    + CAST(split_part(ip, '.', 3) AS integer) * 2^8
    + CAST(split_part(ip, '.', 4) AS integer) * 2^0
    AS ip_integer
FROM
    --
PostgreSQL必須明確地指定資料型別
    (SELECT CAST('192.168.0.1' AS text) AS ip) AS t
;
n   IP位置轉換成以0補位的字串:
SELECT
    ip
    , lpad(split_part(ip, '.', 1), 3, '0')
    || lpad(split_part(ip, '.', 2), 3, '0')
    || lpad(split_part(ip, '.', 3), 3, '0')
    || lpad(split_part(ip, '.', 4), 3, '0')
    AS ip_padding
FROM
    --
PostgreSQL必須明確地指定資料型別
    (SELECT CAST('192.168.0.1' AS text) AS ip) AS t
;

對單一資料表進行操作
l   掌握整體特徵(彙總函數、GROUP BY語法、window函數、OVER(...PARTITION BY~)語法)
n   範例資料表:
DROP TABLE IF EXISTS review;
CREATE TABLE review (
    user_id    varchar(255)
  , product_id varchar(255)
  , score      numeric
);

INSERT INTO review
VALUES
    ('U001', 'A001', 4.0)
  , ('U001', 'A002', 5.0)
  , ('U001', 'A003', 5.0)
  , ('U002', 'A001', 3.0)
  , ('U002', 'A002', 3.0)
  , ('U002', 'A003', 4.0)
  , ('U003', 'A001', 5.0)
  , ('U003', 'A002', 4.0)
  , ('U003', 'A003', 4.0)
;
n   使用彙總函數計算資料表整體特徵:
SELECT
    COUNT(*) AS total_count
    , COUNT(DISTINCT user_id) AS user_count
    , COUNT(DISTINCT product_id) AS product_count
    , SUM(score) AS sum
    , AVG(score) AS avg
    , MAX(score) AS max
    , MIN(score) AS min
FROM review
;
n   將資料表依使用者不同進行分組彙總:
SELECT
    user_id
    , COUNT(*) AS total_count
    , COUNT(DISTINCT product_id) AS product_count
    , SUM(score) AS sum
    , AVG(score) AS avg
    , MAX(score) AS max
    , MIN(score) AS min
FROM review
GROUP BY user_id
;
n   使用視窗函數同時處理彙總函數的結果以及原始值:
SELECT
    user_id
    , product_id
    --
單一評價分數
    , score
    --
全體平均評價分數
    , AVG(score) OVER() AS avg_score
    --
使用者平均評價分數
    , AVG(score) OVER(PARTITION BY user_id) AS user_avg_score
    --
單一評價分數和使用者平均評價分數的差
    , score - AVG(score) OVER(PARTITION BY user_id) AS user_avg_score_diff
FROM review
;
l   群組中排序(視窗函數、OVER(...ORDER BY~)句、OVER(...ROWS~))
n   範例資料表:
DROP TABLE IF EXISTS popular_products;
CREATE TABLE popular_products (
    product_id varchar(255)
  , category   varchar(255)
  , score      numeric
);

INSERT INTO popular_products
VALUES
    ('A001', 'action', 94)
  , ('A002', 'action', 81)
  , ('A003', 'action', 78)
  , ('A004', 'action', 64)
  , ('D001', 'drama' , 90)
  , ('D002', 'drama' , 82)
  , ('D003', 'drama' , 78)
  , ('D004', 'drama' , 58)
;
n   利用視窗函數的ORDER BY句,設定資料表內的順序:
SELECT
    product_id
    , score
    --
score的降冪順序給予唯一的編號
    , ROW_NUMBER() OVER(ORDER BY score DESC) AS row
    --
允許同樣順位編號的排行方式,同順位之後會跳號
    , RANK() OVER(ORDER BY score DESC) AS rank
    --
允許同順位,同順位之後不跳號的排行方式
    , DENSE_RANK() OVER(ORDER BY score DESC) AS dense_rank
    --
取得目前資料行之前的資料行的值
    , LAG(product_id) OVER(ORDER BY score DESC) AS lag1
    , LAG(product_id, 2) OVER(ORDER BY score DESC) AS lag2
    --
取得目前資料行之後的資料行的值
    , LEAD(product_id) OVER(ORDER BY score DESC) AS lead1
    , LEAD(product_id, 2) OVER(ORDER BY score DESC) AS lead2
FROM popular_products
ORDER BY row
;
n   搭配使用ORDER BY句與彙總函數的Query
SELECT
    product_id
    , score
    --
score的降冪順序給予唯一的編號
    , ROW_NUMBER() OVER(ORDER BY score DESC) AS row
    --
計算從排序前面的紀錄到目前資料行的累計加總分數(score)
    , SUM(score) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_score
    --
計算目前資料行與前後各一筆資料行,共三筆資料行的平均分數
    , AVG(score) OVER(ORDER BY score DESC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS local_avg
    --
取得排行最前面的商品ID
    , FIRST_VALUE(product_id) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS first_value
    --
取得排行最後面的商品ID
    , LAST_VALUE(product_id) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last_value
FROM popular_products
ORDER BY row
;
n   在每個指定的視窗框架內彙總全部的商品ID
SELECT
    product_id
    --
score的降冪順序給予唯一的編號
    , ROW_NUMBER() OVER(ORDER BY score DESC) AS row
    --
以排序的最前端到最後端作為範圍,彙總商品ID
    , array_agg(product_id) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS whole_agg
    --
以排序的最前端到目前的資料行作為範圍,彙總商品ID
    , array_agg(product_id) OVER(ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_agg
    --
將排序中的上一筆資料行到後一筆資料行作為處理範圍,彙總商品ID
    , array_agg(product_id) OVER(ORDER BY score DESC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS local_agg
FROM popular_products
WHERE category = 'action'
ORDER BY row
;
n   用視窗函數分組計算各類別內的順位:
SELECT
    category
    , product_id
    , score
    --
各類別內依分數進行排序,並給予唯一的順位編號
    , ROW_NUMBER() OVER(PARTITION BY category ORDER BY score DESC) AS row
    --
各類別內以允許同順位的方式進行排序,並給予順位編號
    , RANK() OVER(PARTITION BY category ORDER BY score DESC) AS rank
    --
各類別內允許同順位,以同順位之後不跳號的方式進行排序
    , DENSE_RANK() OVER(PARTITION BY category ORDER BY score DESC) AS dense_rank
FROM popular_products
ORDER BY category, row
;
n   擷取各類別中排行前兩名的商品:
SELECT *
FROM
    --
在子查詢中計算排名
    (SELECT
        category
        , product_id
        , score
        --
各類別內分別依分數排序,給予唯一的順位編號
        , ROW_NUMBER() OVER(PARTITION BY category ORDER BY score DESC) AS rank
    FROM popular_products
    ) AS popular_products_with_rank
--
外側的查詢設定排名條件
WHERE rank <= 2
ORDER BY category, rank
;
n   取得各類別分組分數第一名的商品:
--
使用DISTINCT句排除重複的項目
SELECT DISTINCT
    category
    --
取得各類別分組第一名的商品ID
    , FIRST_VALUE(product_id) OVER(PARTITION BY category ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS product_id
FROM popular_products
;
l   將縱向數據轉換成橫向數據(MAX(CASE~)語法、string_agg函數、listagg函數、collect_list函數)
n   範例資料表:
DROP TABLE IF EXISTS daily_kpi;
CREATE TABLE daily_kpi (
    dt        varchar(255)
  , indicator varchar(255)
  , val       integer
);

INSERT INTO daily_kpi
VALUES
    ('2017-01-01', 'impressions', 1800)
  , ('2017-01-01', 'sessions'   ,  500)
  , ('2017-01-01', 'users'      ,  200)
  , ('2017-01-02', 'impressions', 2000)
  , ('2017-01-02', 'sessions'   ,  700)
  , ('2017-01-02', 'users'      ,  250)
;

DROP TABLE IF EXISTS purchase_detail_log;
CREATE TABLE purchase_detail_log (
    purchase_id integer
  , product_id  varchar(255)
  , price       integer
);

INSERT INTO purchase_detail_log
VALUES
    (100001, 'A001', 300)
  , (100001, 'A002', 400)
  , (100001, 'A003', 200)
  , (100002, 'D001', 500)
  , (100002, 'D002', 300)
  , (100003, 'A001', 300)
;
n   將以資料行形式儲存的指標數值轉換成以列的形式儲存:
SELECT
    dt
    , MAX(CASE WHEN indicator = 'impressions' THEN val END) AS impressions
    , MAX(CASE WHEN indicator = 'sessions' THEN val END) AS sessions
    , MAX(CASE WHEN indicator = 'users' THEN val END) AS users
FROM daily_kpi
GROUP BY dt
ORDER BY dt
;
n   將資料行彙總成含有區隔符號的字串:
SELECT
    purchase_id
    --
將商品ID集中儲存於陣列,再轉換成以逗號為區隔符號的字串
    , string_agg(product_id, ',') AS product_ids
    , SUM(price) AS amount
FROM purchase_detail_log
GROUP BY purchase_id
ORDER BY purchase_id
;
l   將橫向數據轉換成縱向數據(unnest函數、explode函數、CROSS JOINLATERAL VIEWregexp_split_to_table函數)
n   範例資料表:
DROP TABLE IF EXISTS quarterly_sales;
CREATE TABLE quarterly_sales (
    year integer
  , q1   integer
  , q2   integer
  , q3   integer
  , q4   integer
);

INSERT INTO quarterly_sales
VALUES
    (2015, 82000, 83000, 78000, 83000)
  , (2016, 85000, 85000, 80000, 81000)
  , (2017, 92000, 81000, NULL , NULL )
;
n   使用連號資料表進行數據的行列轉換:
SELECT
    q.year
    --
顯示從Q1Q4的標籤名稱
    , CASE
        WHEN p.idx = 1 THEN 'q1'
        WHEN p.idx = 2 THEN 'q2'
        WHEN p.idx = 3 THEN 'q3'
        WHEN p.idx = 4 THEN 'q4'
    END AS quarter
    --
顯示從Q1Q4的營收金額
    , CASE
        WHEN p.idx = 1 THEN q.q1
        WHEN p.idx = 2 THEN q.q2
        WHEN p.idx = 3 THEN q.q3
        WHEN p.idx = 4 THEN q.q4
    END AS sales
FROM
    quarterly_sales AS q
  CROSS JOIN
    --
建立和欲展開之數據數目相同的連號資料表
    (SELECT 1 AS idx
    UNION ALL SELECT 2 AS idx
    UNION ALL SELECT 3 AS idx
    UNION ALL SELECT 4 AS idx
    ) AS p
;
n   利用資料表函數將陣列展開至record
SELECT unnest(ARRAY['A001', 'A002', 'A003']) AS product_id;
n   範例資料表:
DROP TABLE IF EXISTS purchase_log;
CREATE TABLE purchase_log (
    purchase_id integer
  , product_ids  varchar(255)
);

INSERT INTO purchase_log
VALUES
    (100001, 'A001,A002,A003')
  , (100002, 'D001,D002')
  , (100003, 'A001')
;
n   使用資料表函數展開含有逗點區隔符號的數據:
SELECT
    purchase_id
    , product_id
FROM
    purchase_log AS p
  -- string_to_array
利用函數將字串轉換成陣列,再用unnest函數轉換成資料表
  CROSS JOIN unnest(string_to_array(product_ids, ',')) AS product_id
;
n   利用PostgreSQL展開含有逗點區隔符號的數據:
SELECT
    purchase_id
    --
同時切割含有逗點區隔符號的字串並展開至資料行(record)
    , regexp_split_to_table(product_ids, ',') AS product_id
FROM purchase_log
;
n   建立連號資料表:
SELECT *
FROM (
    SELECT 1 AS idx
    UNION ALL SELECT 2 AS idx
    UNION ALL SELECT 3 AS idx
    ) AS pivot
;
n   split_part的使用範例:
SELECT
    split_part('A001,A002,A003', ',', 1) AS part_1
    , split_part('A001,A002,A003', ',', 2) AS part_2
    , split_part('A001,A002,A003', ',', 3) AS part_3
;
n   用字串文字數的差值計算商品數:
SELECT
    purchase_id
    , product_ids
    --
從商品ID的字串中去除逗號,並利用文字數差值,計算商品數目
    , 1 + char_length(product_ids) - char_length(replace(product_ids, ',', '')) AS product_num
FROM purchase_log
;
n   使用連號資料表將字串展開至資料行:
SELECT
    l.purchase_id
    , l.product_ids
    --
依商品數目給予聯結的編號
    , p.idx
    --
藉由逗號區隔符號分割字串,取出第idx的元素
    , split_part(l.product_ids, ',', p.idx) AS product_id
FROM
    purchase_log AS l
  JOIN
    (SELECT 1 AS idx
    UNION ALL SELECT 2 AS idx
    UNION ALL SELECT 3 AS idx
    ) AS p
    --
只和連號資料表id在商品數目以下的部分進行聯結
    ON p.idx <= (1 + char_length(l.product_ids) - char_length(replace(l.product_ids, ',', '')))
;

對複數資料表進行操作
l   縱向聯結複數資料表(UNION ALL語法)
n   範例資料表:
DROP TABLE IF EXISTS app1_mst_users;
CREATE TABLE app1_mst_users (
    user_id varchar(255)
  , name    varchar(255)
  , email   varchar(255)
);

INSERT INTO app1_mst_users
VALUES
    ('U001', 'Sato'  , 'sato@example.com'  )
  , ('U002', 'Suzuki', 'suzuki@example.com')
;

DROP TABLE IF EXISTS app2_mst_users;
CREATE TABLE app2_mst_users (
    user_id varchar(255)
  , name    varchar(255)
  , phone   varchar(255)
);

INSERT INTO app2_mst_users
VALUES
    ('U001', 'Ito'   , '080-xxxx-xxxx')
  , ('U002', 'Tanaka', '070-xxxx-xxxx')
;
n   使用UNION ALL句縱向聯結資料表:
SELECT
    'app1' AS app_name
    , user_id
    , name
    , email
FROM app1_mst_users
UNION ALL
SELECT
    'app2' AS app_name
    , user_id
    , name
    , NULL AS email
FROM app2_mst_users
;
l   橫向聯結複數資料表(LEFT JOIN、關聯子查詢)
n   範例資料表:
DROP TABLE IF EXISTS mst_categories;
CREATE TABLE mst_categories (
    category_id integer
  , name        varchar(255)
);

INSERT INTO mst_categories
VALUES
    (1, 'dvd' )
  , (2, 'cd'  )
  , (3, 'book')
;

DROP TABLE IF EXISTS category_sales;
CREATE TABLE category_sales (
    category_id integer
  , sales       integer
);

INSERT INTO category_sales
VALUES
    (1, 850000)
  , (2, 500000)
;

DROP TABLE IF EXISTS product_sale_ranking;
CREATE TABLE product_sale_ranking (
    category_id integer
  , rank        integer
  , product_id  varchar(255)
  , sales       integer
);

INSERT INTO product_sale_ranking
VALUES
    (1, 1, 'D001', 50000)
  , (1, 2, 'D002', 20000)
  , (1, 3, 'D003', 10000)
  , (2, 1, 'C001', 30000)
  , (2, 2, 'C002', 20000)
  , (2, 3, 'C003', 10000)
;
n   橫向聯結複數資料表:
SELECT
    m.category_id
    , m.name
    , s.sales
    , r.product_id AS sale_product
FROM
    mst_categories AS m
  JOIN
    --
聯結各類別銷售金額
    category_sales AS s
    ON m.category_id = s.category_id
  JOIN
    --
聯結各類別商品
    product_sale_ranking AS r
    ON m.category_id = r.category_id
;
n   不變動資料表行數並進行複數資料表的橫向聯結:
SELECT
    m.category_id
    , m.name
    , s.sales
    , r.product_id AS top_sale_product
FROM
    mst_categories AS m
  --
使用左外部聯結保留無法建立聯結的record
  LEFT JOIN
    --
聯結各類別銷售金額
    category_sales AS s
    ON m.category_id = s.category_id
  --
使用左外部聯結保留無法建立聯結的record
  LEFT JOIN
    --
聯結各類別銷售第一名的商品
    product_sale_ranking AS r
    ON m.category_id = r.category_id
    AND r.rank = 1
;
n   使用關聯子查詢橫向聯結複數資料表:
SELECT
    m.category_id
    , m.name
    --
用關聯子查詢取得各類別的銷售金額
    , (SELECT s.sales
        FROM category_sales AS s
        WHERE m.category_id = s.category_id
    ) AS sales
    --
用關聯子查詢取得銷售第一的商品(不需使用rank欄位設定條件)
    , (SELECT r.product_id
        FROM product_sale_ranking AS r
        WHERE m.category_id = r.category_id
        ORDER BY sales DESC
        LIMIT 1
    ) AS top_sale_product
FROM mst_categories AS m
;
l   01表示條件旗標(CASE句、SIGN函數)
n   範例資料表:
DROP TABLE IF EXISTS mst_users_with_card_number;
CREATE TABLE mst_users_with_card_number (
    user_id     varchar(255)
  , card_number varchar(255)
);

INSERT INTO mst_users_with_card_number
VALUES
    ('U001', '1234-xxxx-xxxx-xxxx')
  , ('U002', NULL                 )
  , ('U003', '5678-xxxx-xxxx-xxxx')
;

DROP TABLE IF EXISTS purchase_log;
CREATE TABLE purchase_log (
    purchase_id integer
  , user_id     varchar(255)
  , amount      integer
  , stamp       varchar(255)
);

INSERT INTO purchase_log
VALUES
    (10001, 'U001', 200, '2017-01-30 10:00:00')
  , (10002, 'U001', 500, '2017-02-10 10:00:00')
  , (10003, 'U001', 200, '2017-02-12 10:00:00')
  , (10004, 'U002', 800, '2017-03-01 10:00:00')
  , (10005, 'U002', 400, '2017-03-02 10:00:00')
;
n   01的條件旗標來表示信用卡登錄與購買紀錄的狀態:
SELECT
    m.user_id
    , m.card_number
    , COUNT(p.user_id) AS purchase_count
    --
有登記信用卡號碼時用1,沒有登記時用0來表示
    , CASE WHEN m.card_number IS NOT NULL THEN 1 ELSE 0 END AS has_card
    --
有購買紀錄時為1,沒有時為0
    , SIGN(COUNT(p.user_id)) AS has_purchased
FROM
    mst_users_with_card_number AS m
  LEFT JOIN
    purchase_log AS p
    ON m.user_id = p.user_id
GROUP BY
    m.user_id, m.card_number
;
l   將資料表命名後重複利用(Common Table Expression, CTE(WITH))
n   範例資料表:
DROP TABLE IF EXISTS product_sales;
CREATE TABLE product_sales (
    category_name varchar(255)
  , product_id    varchar(255)
  , sales         integer
);

INSERT INTO product_sales
VALUES
    ('dvd' , 'D001', 50000)
  , ('dvd' , 'D002', 20000)
  , ('dvd' , 'D003', 10000)
  , ('cd'  , 'C001', 30000)
  , ('cd'  , 'C002', 20000)
  , ('cd'  , 'C003', 10000)
  , ('book', 'B001', 20000)
  , ('book', 'B002', 15000)
  , ('book', 'B003', 10000)
  , ('book', 'B004',  5000)
;
n   綜合顯示各類別的排行榜:
WITH
product_sale_ranking AS (
    SELECT
        category_name
        , product_id
        , sales
        , ROW_NUMBER() OVER(PARTITION BY category_name ORDER BY sales DESC) AS rank
    FROM product_sales
)
, mst_rank AS (
    SELECT DISTINCT rank
    FROM product_sale_ranking
)
SELECT
    m.rank
    , r1.product_id AS dvd
    , r1.sales AS dvd_sales
    , r2.product_id AS cd
    , r2.sales AS cd_sales
    , r3.product_id AS book
    , r3.sales AS book_sales
FROM
    mst_rank AS m
  LEFT JOIN
    product_sale_ranking AS r1
    ON m.rank = r1.rank
    AND r1.category_name = 'dvd'
  LEFT JOIN
    product_sale_ranking AS r2
    ON m.rank = r2.rank
    AND r2.category_name = 'cd'
  LEFT JOIN
    product_sale_ranking AS r3
    ON m.rank = r3.rank
    AND r3.category_name = 'book'
ORDER BY m.rank
;
l   建立虛擬資料表(UNION ALL句、VALUE句、explode函數、generate_series函數)
n   範例資料表:
DROP TABLE IF EXISTS users;
CREATE TABLE users(
    user_id         varchar(255)
  , regist_device integer
);

INSERT INTO users
VALUES
    ('U001', 1)
  , ('U002', 2)
  , ('U003', 3)
;
n   使用虛擬資料表將編碼值轉換成標籤:
WITH
mst_devices AS (
    SELECT 1 AS device_id, 'PC' AS device_name
    UNION ALL SELECT 2 AS device_id, 'SP' AS device_name
    UNION ALL SELECT 3 AS device_id, 'APP' AS device_name
)
SELECT
    u.user_id
    , d.device_name
FROM
    users AS u
  LEFT JOIN
    mst_devices AS d
    ON u.regist_device = d.device_id
;
n   VALUES句動態建立資料表:
WITH
mst_devices(device_id, device_name) AS (
    VALUES
        (1, 'PC')
        , (2, 'SP')
        , (3, 'APP')
)
SELECT *
FROM mst_devices
;
n   建立有連號的虛擬資料表:
WITH
series AS (
    --
建立15的連續編號
    SELECT generate_series(1, 5) AS idx
)
SELECT *
FROM series
;

二、營收狀態相關數據的彙總、分析

沿著時間軸蒐集數據
l   彙總每日營收(GROUP BY句、SUM函數、AVG函數)(分析:加總、平均)
n   範例資歷表:
DROP TABLE IF EXISTS purchase_log;
CREATE TABLE purchase_log(
    dt              varchar(255)
  , order_id        integer
  , user_id         varchar(255)
  , purchase_amount integer
);

INSERT INTO purchase_log
VALUES
    ('2014-01-01',  1, 'rhwpvvitou', 13900)
  , ('2014-01-01',  2, 'hqnwoamzic', 10616)
  , ('2014-01-02',  3, 'tzlmqryunr', 21156)
  , ('2014-01-02',  4, 'wkmqqwbyai', 14893)
  , ('2014-01-03',  5, 'ciecbedwbq', 13054)
  , ('2014-01-03',  6, 'svgnbqsagx', 24384)
  , ('2014-01-03',  7, 'dfgqftdocu', 15591)
  , ('2014-01-04',  8, 'sbgqlzkvyn',  3025)
  , ('2014-01-04',  9, 'lbedmngbol', 24215)
  , ('2014-01-04', 10, 'itlvssbsgx',  2059)
  , ('2014-01-05', 11, 'jqcmmguhik',  4235)
  , ('2014-01-05', 12, 'jgotcrfeyn', 28013)
  , ('2014-01-05', 13, 'pgeojzoshx', 16008)
  , ('2014-01-06', 14, 'msjberhxnx',  1980)
  , ('2014-01-06', 15, 'tlhbolohte', 23494)
  , ('2014-01-06', 16, 'gbchhkcotf',  3966)
  , ('2014-01-07', 17, 'zfmbpvpzvu', 28159)
  , ('2014-01-07', 18, 'yauwzpaxtx',  8715)
  , ('2014-01-07', 19, 'uyqboqfgex', 10805)
  , ('2014-01-08', 20, 'hiqdkrzcpq',  3462)
  , ('2014-01-08', 21, 'zosbvlylpv', 13999)
  , ('2014-01-08', 22, 'bwfbchzgnl',  2299)
  , ('2014-01-09', 23, 'zzgauelgrt', 16475)
  , ('2014-01-09', 24, 'qrzfcwecge',  6469)
  , ('2014-01-10', 25, 'njbpsrvvcq', 16584)
  , ('2014-01-10', 26, 'cyxfgumkst', 11339)
;
n   彙總每日營收和平均購買金額的Query程式:
SELECT
    dt
    , COUNT(*) AS purchase_count
    , SUM(purchase_amount) AS total_amount
    , AVG(purchase_amount) AS avg_amount
FROM purchase_log
GROUP BY dt
ORDER BY dt
;
l   使用移動平均觀察每日的變動(OVER(ORDER BY~))(分析:移動平均)
n   範例資歷表:
同上例
n   彙總每日營收與七日移動平均的Query
SELECT
    dt
    , SUM(purchase_amount) AS total_amount
    --
計算過去最多七日間的平均營收金額
    , AVG(SUM(purchase_amount)) OVER(ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS seven_day_avg
    --
嚴謹計算過去七日間的平均營收金額
    , CASE
        WHEN
            7 = COUNT(*)
            OVER(ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW)
        THEN
            AVG(SUM(purchase_amount))
            OVER(ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW)
    END AS seven_day_avg_strict
FROM purchase_log
GROUP BY dt
ORDER BY dt
;
l   計算當月累計營收(OVER(PARTITION BY~ ORDER BY~))(分析:累計)
n   範例資歷表:
同上例
n   彙總每日營收與當月累計營收的Query
SELECT
    dt
    --
使用substringsubstr擷取「年-月」的部份
    , substring(dt, 1, 7) AS year_month
    , SUM(purchase_amount) AS total_amount
    , SUM(SUM(purchase_amount)) OVER(PARTITION BY substring(dt, 1, 7) ORDER BY dt ROWS UNBOUNDED PRECEDING) AS agg_amount
FROM purchase_log
GROUP BY dt
ORDER BY dt
;
n   daily_purchase資料表計算當月累計營收的Query
WITH
daily_purchase AS (
    SELECT
        dt
        --
分別擷取「年」、「月」、「日」
        , substring(dt, 1, 4) AS year
        , substring(dt, 6, 2) AS month
        , substring(dt, 9, 2) AS date
        , SUM(purchase_amount) AS purchase_amount
    FROM purchase_log
    GROUP BY dt
)
SELECT
    dt
    , concat(year, '-', month) AS year_month
    , purchase_amount
    , SUM(purchase_amount) OVER(PARTITION BY year, month ORDER BY dt ROWS UNBOUNDED PRECEDING) AS agg_amount
FROM daily_purchase
ORDER BY dt
;
l   計算每月營收的年成長率(SUM(CASE~END))(分析:年成長率)
n   範例資歷表:
DROP TABLE IF EXISTS purchase_log;
CREATE TABLE purchase_log(
    dt              varchar(255)
  , order_id        integer
  , user_id         varchar(255)
  , purchase_amount integer
);

INSERT INTO purchase_log
VALUES
    ('2014-01-01',    1, 'rhwpvvitou', 13900)
  , ('2014-02-08',   95, 'chtanrqtzj', 28469)
  , ('2014-03-09',  168, 'bcqgtwxdgq', 18899)
  , ('2014-04-11',  250, 'kdjyplrxtk', 12394)
  , ('2014-05-11',  325, 'pgnjnnapsc',  2282)
  , ('2014-06-12',  400, 'iztgctnnlh', 10180)
  , ('2014-07-11',  475, 'eucjmxvjkj',  4027)
  , ('2014-08-10',  550, 'fqwvlvndef',  6243)
  , ('2014-09-10',  625, 'mhwhxfxrxq',  3832)
  , ('2014-10-11',  700, 'wyrgiyvaia',  6716)
  , ('2014-11-10',  775, 'cwpdvmhhwh', 16444)
  , ('2014-12-10',  850, 'eqeaqvixkf', 29199)
  , ('2015-01-09',  925, 'efmclayfnr', 22111)
  , ('2015-02-10', 1000, 'qnebafrkco', 11965)
  , ('2015-03-12', 1075, 'gsvqniykgx', 20215)
  , ('2015-04-12', 1150, 'ayzvjvnocm', 11792)
  , ('2015-05-13', 1225, 'knhevkibbp', 18087)
  , ('2015-06-10', 1291, 'wxhxmzqxuw', 18859)
  , ('2015-07-10', 1366, 'krrcpumtzb', 14919)
  , ('2015-08-08', 1441, 'lpglkecvsl', 12906)
  , ('2015-09-07', 1516, 'mgtlsfgfbj',  5696)
  , ('2015-10-07', 1591, 'trgjscaajt', 13398)
  , ('2015-11-06', 1666, 'ccfbjyeqrb',  6213)
  , ('2015-12-05', 1741, 'onooskbtzp', 26024)
;
n   計算每月營收和年成長率:
WITH
daily_purchase AS (
    SELECT
        dt
        --
分別擷取「年」、「月」、「日」
        , substring(dt, 1, 4) AS year
        , substring(dt, 6, 2) AS month
        , substring(dt, 9, 2) AS date
        , SUM(purchase_amount) AS purchase_amount
    FROM purchase_log
    GROUP BY dt
)
SELECT
    month
    , SUM(CASE year WHEN '2014' THEN purchase_amount END) AS amount_2014
    , SUM(CASE year WHEN '2015' THEN purchase_amount END) AS amount_2015
    , 100.0 * SUM(CASE year WHEN '2015' THEN purchase_amount END) / SUM(CASE year WHEN '2014' THEN purchase_amount END) AS rate
FROM daily_purchase
GROUP BY month
ORDER BY month
;
l   利用Z圖表確認業績的變化(SUM(CASE~END) OVER(ORDER BY~))(分析:Z圖表,建立Z圖表需要每月營收、累計營收、移動年營收三個指標)
n   範例資歷表:
同上例
n   建立2015年營收的Z圖表:
WITH
daily_purchase AS (
    SELECT
        dt
        --
分別擷取「年」、「月」、「日」
        , substring(dt, 1, 4) AS year
        , substring(dt, 6, 2) AS month
        , substring(dt, 9, 2) AS date
        , SUM(purchase_amount) AS purchase_amount
    FROM purchase_log
    GROUP BY dt
)
, monthly_amount AS (
    --
彙總每月營收
    SELECT
        year
        , month
        , SUM(purchase_amount) AS amount
    FROM daily_purchase
    GROUP BY year, month
)
, calc_index AS (
    SELECT
        year
        , month
        , amount
        --
計算2015年的累計營收
        , SUM(CASE WHEN year = '2015' THEN amount END) OVER(ORDER BY year, month ROWS UNBOUNDED PRECEDING) AS agg_amount
        --
計算從當月開始到11個月之前的合計營收(移動年營收)
        , SUM(amount) OVER(ORDER BY year, month ROWS BETWEEN 11 PRECEDING AND CURRENT ROW) AS year_avg_amount
    FROM monthly_amount
    ORDER BY year, month
)
--
最後只擷取出2015年的數據
SELECT
    concat(year, '-', month) AS year_month
    , amount
    , agg_amount
    , year_avg_amount
FROM calc_index
WHERE year = '2015'
ORDER BY year_month
;
l   掌握影響營收的重要因素:
n   範例資歷表:
同上例
n   彙總營收相關指標的Query
WITH
daily_purchase AS (
    SELECT
        dt
        --
分別擷取「年」、「月」、「日」
        , substring(dt, 1, 4) AS year
        , substring(dt, 6, 2) AS month
        , substring(dt, 9, 2) AS date
        , SUM(purchase_amount) AS purchase_amount
    FROM purchase_log
    GROUP BY dt
)
, monthly_purchase AS (
    SELECT
        year
        , month
        , AVG(purchase_amount) AS avg_amount
        , SUM(purchase_amount) AS monthly
    FROM daily_purchase
    GROUP BY year, month
)
SELECT
    concat(year, '-', month) AS year_month
    , avg_amount
    , monthly
    , SUM(monthly) OVER(PARTITION BY year ORDER BY month ROWS UNBOUNDED PRECEDING) AS agg_amount
    --
取得12個月前的營收
    , LAG(monthly, 12) OVER(ORDER BY year, month) AS last_year
    --
計算當月營收和12個月前營收的比值
    , 100.0 * monthly / LAG(monthly, 12) OVER(ORDER BY year, month) AS rate
FROM monthly_purchase
ORDER BY year_month
;

多面向的蒐集數據
l   計算各類別銷售金額與小計(UNION ALL句、ROLLUP)(分析:小計)
n   範例資歷表:
DROP TABLE IF EXISTS purchase_detail_log;
CREATE TABLE purchase_detail_log(
    dt           varchar(255)
  , order_id     integer
  , user_id      varchar(255)
  , item_id      varchar(255)
  , price        integer
  , category     varchar(255)
  , sub_category varchar(255)
);

INSERT INTO purchase_detail_log
VALUES
    ('2015-12-01',   1, 'U001', 'D001', 200, 'ladys_fashion', 'bag'        )
  , ('2015-12-08',  95, 'U002', 'D002', 300, 'dvd'          , 'documentary')
  , ('2015-12-09', 168, 'U003', 'D003', 500, 'game'         , 'accessories')
  , ('2015-12-11', 250, 'U004', 'D004', 800, 'ladys_fashion', 'jacket'     )
  , ('2015-12-11', 325, 'U005', 'D005', 200, 'mens_fashion' , 'jacket'     )
  , ('2015-12-12', 400, 'U006', 'D006', 400, 'cd'           , 'classic'    )
  , ('2015-12-11', 475, 'U007', 'D007', 400, 'book'         , 'business'   )
  , ('2015-12-10', 550, 'U008', 'D008', 600, 'food'         , 'meats'      )
  , ('2015-12-10', 625, 'U009', 'D009', 600, 'food'         , 'fish'       )
  , ('2015-12-11', 700, 'U010', 'D010', 200, 'supplement'   , 'protain'    )
;
n   同時取得類別銷售金額與小計的Querys
WITH
sub_category_amount AS (
    --
彙總小類別的銷售金額
    SELECT
        category AS category
        , sub_category AS sub_category
        , SUM(price) AS amount
    FROM purchase_detail_log
    GROUP BY category, sub_category
)
, category_amount AS (
    --
彙總大類別的銷售金額
    SELECT
        category
        , 'all' AS sub_category
        , SUM(price) AS amount
    FROM purchase_detail_log
    GROUP BY category
)
, total_amount AS (
    --
計算全體銷售總額
    SELECT
        'all' AS category
        , 'all' AS sub_category
        , SUM(price) AS amount
    FROM purchase_detail_log
)
SELECT category, sub_category, amount
FROM sub_category_amount
UNION ALL
SELECT category, sub_category, amount
FROM category_amount
UNION ALL
SELECT category, sub_category, amount
FROM total_amount
;
n   使用ROLLUP同時取得類別銷售金額和小計:
SELECT
    COALESCE(category, 'all') AS category
    , COALESCE(sub_category, 'all') AS sub_category
    , SUM(price) AS amount
FROM purchase_detail_log
GROUP BY ROLLUP(category, sub_category)
;
l   利用ABC分析判別銷售情形(SUM(~) OVER(ORDER BY~))(分析:累計百分比、ABC分析:Rank A是累積百分比前面0~70%Rank B是累積百分比70~90%Rank C是累積百分比90~100%)
n   範例資歷表:
同上例
n   計算營收組成的累計百分比與ABC Rank
WITH
monthly_sales AS (
    SELECT
        category
        --
計算各類別的營收
        , SUM(price) AS amount
    FROM purchase_detail_log
    --
將數據範圍限制在目標月份
    WHERE dt BETWEEN '2015-12-01' AND '2015-12-31'
    GROUP BY category
)
, sales_composition_ratio AS (
    SELECT
        category
        , amount
        --
所佔百分比: 100.0 * 類別營收 / 全體營收
        , 100.0 * amount / SUM(amount) OVER() AS composition_ratio
        --
累計百分比: 100.0 * 類別累計營收 / 全體營收
        , 100.0 * SUM(amount) OVER(ORDER BY amount DESC) / SUM(amount) OVER() AS cumulative_ratio
    FROM monthly_sales
)
SELECT
    *
    --
根據累計百分比進行分級
    , CASE
        WHEN cumulative_ratio BETWEEN 0 AND 70 THEN 'A'
        WHEN cumulative_ratio BETWEEN 70 AND 90 THEN 'B'
        WHEN cumulative_ratio BETWEEN 90 AND 100 THEN 'C'
    END AS abc_rank
FROM sales_composition_ratio
ORDER BY amount DESC
;
l   利用Fan chart判讀商品銷售的成長率(FIRST_VALUE視窗函數)(分析:Fan chart,是將作為基準的特定時間點設定為100%,以圖形的方式觀察之後的數值變動情形)
n   範例資歷表:
同上例
n   取得建立Fan chart所需之數據:
WITH
daily_category_amount AS (
    SELECT
        dt
        , category
        , substring(dt, 1, 4) AS year
        , substring(dt, 6, 2) AS month
        , substring(dt, 9, 2) AS date
        , SUM(price) AS amount
    FROM purchase_detail_log
    GROUP BY dt, category
)
, monthly_category_amount AS (
    SELECT
        concat(year, '-', month) AS year_month
        , category
        , SUM(amount) AS amount
    FROM daily_category_amount
    GROUP BY year, month, category
)
SELECT
    year_month
    , category
    , amount
    , FIRST_VALUE(amount) OVER(PARTITION BY category ORDER BY year_month, category ROWS UNBOUNDED PRECEDING) AS base_amount
    , 100.0 * amount / FIRST_VALUE(amount) OVER(PARTITION BY category ORDER BY year_month, category ROWS UNBOUNDED PRECEDING) AS rate
FROM monthly_category_amount
ORDER BY year_month, category
;
l   使用直方圖彙總購買價格區間(width_bucket函數)(分析:次數分配表、直方圖)
n   範例資歷表:
同上例
n   計算各數據所屬階級,調整階級上限的值:
WITH
stats AS (
    SELECT
        --
銷售價格的最大值 + 1
        MAX(price) + 1 AS max_price
        --
銷售價格的最小值
        , MIN(price) AS min_price
        --
全體價格的範圍 + 1
        , MAX(price) + 1 - MIN(price) AS range_price
        --
階級數
        , 10 AS bucket_num
    FROM purchase_detail_log
)
, purchase_log_with_bucket AS (
    SELECT
        price
        , min_price
        --
標準化價格:目標價格減去最小價格
        , price - min_price AS diff
        --
階級範圍:全體價格區間除以階級數
        , 1.0 * range_price / bucket_num AS bucket_range
        --
階級的判定:FLOOR(標準化價格/階級範圍)
        , FLOOR(
            1.0 * (price - min_price) / (1.0 * range_price / bucket_num)
            --
由於index是從1開始所以加上1
        ) + 1 AS bucket
        --
階級的判定:或使用PostgreSQL內建的函數
        -- , width_bucket(price, min_price, max_price, bucket_num) AS bucket
    FROM purchase_detail_log, stats
)
SELECT *
FROM purchase_log_with_bucket
ORDER BY price
;
n   取得建立直方圖所需之數據:
WITH
stats AS (
    SELECT
        --
銷售價格的最大值 + 1
        MAX(price) + 1 AS max_price
        --
銷售價格的最小值
        , MIN(price) AS min_price
        --
全體價格的範圍 + 1
        , MAX(price) + 1 - MIN(price) AS range_price
        --
階級數
        , 10 AS bucket_num
    FROM purchase_detail_log
)
, purchase_log_with_bucket AS (
    SELECT
        price
        , min_price
        --
標準化價格:目標價格減去最小價格
        , price - min_price AS diff
        --
階級範圍:全體價格區間除以階級數
        , 1.0 * range_price / bucket_num AS bucket_range
        --
階級的判定:FLOOR(標準化價格/階級範圍)
        , FLOOR(
            1.0 * (price - min_price) / (1.0 * range_price / bucket_num)
            --
由於index是從1開始所以加上1
        ) + 1 AS bucket
        --
階級的判定:或使用PostgreSQL內建的函數
        -- , width_bucket(price, min_price, max_price, bucket_num) AS bucket
    FROM purchase_detail_log, stats
)
SELECT
    bucket
    --
計算階級的上限與下限
    , min_price + bucket_range * (bucket - 1) AS lower_limit
    , min_price + bucket_range * bucket AS upper_limit
    --
計算次數
    , COUNT(price) AS num_purchase
    --
計算各階級的合計金額
    , SUM(price) AS total_amount
FROM purchase_log_with_bucket
GROUP BY bucket, min_price, bucket_range
ORDER BY bucket
;
n   手動調整直方圖的上限與下限:
WITH
stats AS (
    SELECT
        --
價格的最大值
        5000 AS max_price
        --
價格的最小值
        , 0 AS min_price
        --
價格區間
        , 5000 AS range_price
        --
階級數
        , 10 AS bucket_num
    FROM purchase_detail_log
)
, purchase_log_with_bucket AS (
    SELECT
        price
        , min_price
        --
標準化價格:目標價格減去最小價格
        , price - min_price AS diff
        --
階級範圍:全體價格區間除以階級數
        , 1.0 * range_price / bucket_num AS bucket_range
        --
階級的判定:FLOOR(標準化價格/階級範圍)
        , FLOOR(
            1.0 * (price - min_price) / (1.0 * range_price / bucket_num)
            --
由於index是從1開始所以加上1
        ) + 1 AS bucket
        --
階級的判定:或使用PostgreSQL內建的函數
        -- , width_bucket(price, min_price, max_price, bucket_num) AS bucket
    FROM purchase_detail_log, stats
)
SELECT
    bucket
    --
計算階級的上限與下限
    , min_price + bucket_range * (bucket - 1) AS lower_limit
    , min_price + bucket_range * bucket AS upper_limit
    --
計算次數
    , COUNT(price) AS num_purchase
    --
計算各階級的合計金額
    , SUM(price) AS total_amount
FROM purchase_log_with_bucket
GROUP BY bucket, min_price, bucket_range
ORDER BY bucket
;

三、彙整第五至八章出現的新語法

彙整第五至八章出現的新語法
l   利用SQL標準中所定義的CUBE語法,就可簡單地考量到所有的組合方式,並彙總數據:
--
使用CUBE列舉action的所有組合方式
SELECT
    has_purchase
    , has_review
    , has_favorite
    , COUNT(1) AS users
FROM user_action_flag
GROUP BY CUBE(has_purchase, has_review, has_favorite)
;
l   十分位分析(Decile Analysis)這種等間距的分組方式,可使用NTILE視窗函數進行:
SELECT
    user_id
    , purchase_amount
    , ntile(10) OVER (ORDER BY purchase_amount DESC) AS decile
FROM user_purchase_amount
;
l   利用PERCENT_RANK函數以百分比呈現網頁瀏覽數的排行,PERCENT_RANK的值是用(rank - 1) / (全部數據行數 - 1)所計算出的百分比:
SELECT
    session
    , count
    , RANK() OVER(ORDER BY count DESC) AS rank
    , PERCENT_RANK() OVER(ORDER BY count DESC) AS percent_rank
FROM session_count
;
l   LIKE運算子,利用規則排除數據:
SELECT *
FROM action_log_with_noise
WHERE
    NOT
    --
列舉網路爬蟲的判定條件
    (user_agent LIKE'%bot%'
    OR user_agent LIKE'%crawler%'
    OR user_agent LIKE'%spider%'
    OR user_agent LIKE'%archiver%'
    )
;
l   HAVING句,確認鍵值重複record的數值:
SELECT
    id
    , COUNT(*) AS record_num
    --
將數據彙總至陣列,並以逗點作為區隔符號轉換成字串
    , string_agg(name, ',') AS name_list
    , string_agg(stamp, ',') AS stamp_list
FROM mst_categories
GROUP BY id
--
只保留有重複的ID
HAVING COUNT(*) > 1
;
l   計算標準差、標準分數、偏差值:
SELECT
    subject
    , name
    , score
    --
計算每個學科的標準差
    , stddev_pop(score) OVER(PARTITION BY subject) AS stddev_pop
    --
計算每個學科的平均數
    , AVG(score) OVER(PARTITION BY subject) AS avg_score
    --
計算每個分數的標準分數
    , (score - AVG(score) OVER(PARTITION BY subject)) / stddev_pop(score) OVER(PARTITION BY subject) AS std_value
    --
計算每個分數的偏差值
    , 10.0 * (score - AVG(score) OVER(PARTITION BY subject)) / stddev_pop(score) OVER(PARTITION BY subject) + 50 AS deviation
FROM exam_scores
ORDER BY subject, name
;

沒有留言:

張貼留言