2019年1月18日 星期五

Python資料科學學習手冊學習筆記 (2) Matplotlib

  學習了這麼多基礎的Python語言操作,並且學會了基礎的網路擷取技術,但是直到開始閱讀這本書之前,都不算步入資料科學的領域,而是徘徊於基礎的程式語言世界而已,不過儘管如此,應用基礎程式語言替我們帶來的電腦自動化操作,已確實為我們擺脫無聊工作的枷鎖。

  工作上或生活中,難免需要處理多欄位表格資料的清理、篩選、串接、合併、聚合等資訊方面的操作,進而繪製圖表以利掌握數據的整體樣貌,而部落格版主我亦於工作上碰到這方面的問題:使用傳統的程式迴圈雖然也能解決資料處理的問題,但隨著大數據時代的來臨,傳統的程式迴圈顯得極度缺乏效率,版主我曾寫了簡單的三層迴圈以處理工作上臨時性的資料需求,僅僅八千餘筆的資料卻讓該三層迴圈跑了約五分鐘之久才產出結果,萬一程式運作過程出現任何問題,豈不是要重新再耗費一段五分鐘時間?更不用說處理數萬筆、數十萬筆,甚至是真正的大數據時,將耗費極大的時間與運算資源。

  資料科學正是因應大數據與機器學習的崛起,而不斷發光發熱的一門顯學,這本由Jake VanderPlas著作,何敏煌譯著的《Python資料科學學習手冊》(Python Data Science Handbook: Essential Tools for Working with Data),是入門資料科學的良好讀物,作者詳盡說明了NumPy、Pandas、Matplotlib、Scikit-Learn四大資料科學相關之Python套件的操作方法,並透過各式範例展現這些套件工具的靈活應用性與高效率運算能力。

  本書適合已擁有基礎Python能力的人閱讀,且閱讀本書時可能需要自行整理本書重點,因為本書將各套件功能詳盡說明,編排時不免顯得些許混雜,若沒有自行梳理程式邏輯的習慣,閱讀起來可能會覺得混亂,而難以駕馭本書所介紹的各項功能強大且好用的程式工具。

第四章:使用Matplotlib進行視覺化

Matplotlib繪圖環境
l   在腳本中繪圖:
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(0, 10, 100)
plt.plot(x, np.sin(x))
plt.show()
l   IPython Shell中繪圖:
%matplotlib
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(0, 10, 100)
plt.plot(x, np.sin(x))
# plt.draw() 用以強制更新圖表
l   Jupyter Notebook中繪圖:
# %matplotlib notebook 會讓互動式圖形嵌在Notebook
# %matplotlib inline 會讓靜態圖形嵌在Notebook
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(0, 10, 100)
plt.plot(x, np.sin(x))

Matplotlib繪圖介面
l   MATLAB型式介面:
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(0, 10, 100)
# plt.subplot(rows, columns, panels)
plt.subplot(2, 1, 1)
plt.plot(x, np.sin(x))
plt.subplot(2, 1, 2)
plt.plot(x, np.cos(x))
plt.show()
l   物件導向式介面:
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(0, 10, 100)
fig, ax = plt.subplots(2)
ax[0].plot(x, np.sin(x))
ax[1].plot(x, np.cos(x))
plt.show()
l   由表格直接繪圖:
import pandas as pd
import matplotlib.pyplot as plt
# https://github.com/jakevdp/PythonDataScienceHandbook/tree/master/notebooks/data
birthsData = pd.read_csv("births.csv")
birthsData.pivot_table("births", index = "year", columns = "gender", aggfunc = "sum").plot()
plt.show()

Matplotlib繪圖基礎
l   繪圖介面設定:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)

# Select the plotting style. Newer Matplotlib releases ship the seaborn style
# sheets under "seaborn-v0_8-*" names and no longer accept the old spelling,
# so use whichever name this installation lists as available.
styleName = next((name for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid") if name in plt.style.available), "default")
plt.style.use(styleName)

# MATLAB-style interface
plt.plot(x, np.sin(x))
# Save the figure
plt.savefig("sinFigure.png")

# Object-oriented interface
# Create the figure container `fig` and the axes `ax`
fig = plt.figure()
ax = plt.axes()
ax.plot(x, np.cos(x))
# Save the figure
fig.savefig("cosFigure.png")
# Print the file formats supported for saving figures
print(fig.canvas.get_supported_filetypes())
l   圖表標籤設定:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)

#
使用MATLAB型式介面
plt.plot(x, np.sin(x), label = "sin()")
plt.plot(x, np.cos(x), label = "cos()")
plt.title("Sine & Cosine Curve")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()

#
使用物件導向式介面
fig = plt.figure()
ax = plt.axes()
ax.plot(x, np.sin(x), label = "sin()")
ax.plot(x, np.cos(x), label = "cos()")
ax.set(title = "Sine & Cosine Curve", xlabel = "x", ylabel = "y")
ax.legend()

plt.show()
l   圖表XY軸設定:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)

#
使用MATLAB型式介面
plt.plot(x, np.sin(x))
plt.xlim(-1, 11)
plt.ylim(-1.5, 1.5)
# plt.axis([-1, 11, -1.5, 1.5]) XY
軸範圍設定
# plt.axis("equal") XY
軸範圍設定比例相同
# plt.axis("tight")
圖形自動變緊密
# plt.grid(True)
設定格線

#
使用物件導向式介面
fig = plt.figure(figsize = (6.25, 4.75))
ax = plt.axes()
ax.plot(x, np.sin(x))
ax.set(xlim = (-1, 11), ylim = (-1.5, 1.5))
# ax.axis([-1, 11, -1.5, 1.5]) XY
軸範圍設定
# ax.axis("equal") XY
軸範圍設定比例相同
# ax.axis("tight")
圖形自動變緊密
# ax.grid(True)
設定格線

plt.show()
l   線條樣式設定:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)

#
使用MATLAB型式介面
plt.plot(x, np.sin(x), color = "blue", linestyle = "-")
plt.plot(x, np.cos(x), "g--")

#
使用物件導向式介面
fig = plt.figure()
ax = plt.axes()
ax.plot(x, np.sin(x), linestyle = "-.", color = "#FFDD44")
ax.plot(x, np.cos(x), ":b")

plt.show()
l   圖例樣式設定:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)

#
使用MATLAB型式介面
plt.plot(x, np.sin(x), label = "sin()")
plt.plot(x, np.cos(x), label = "cos()")
plt.legend(frameon = False, loc = "lower center", ncol = 2)

#
使用物件導向式介面
fig = plt.figure()
ax = plt.axes()
ax.plot(x, np.sin(x), label = "sin()")
ax.plot(x, np.cos(x), label = "cos()")
ax.legend(fancybox = True, framealpha = 1, shadow = True, borderpad = 1)

plt.show()

Matplotlib圖表類型
l   折線圖:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 20)
y = np.sin(x)

plt.plot(x, y, marker = "o")

plt.show()

l   散佈圖-使用pyplot.plot()繪製:
import numpy as np
import matplotlib.pyplot as plt

for marker in ["o", ".", ",", "x", "+", "v", "^", "<", ">", "s", "d"]:
    plt.plot(np.random.rand(3), np.random.rand(3), marker, label = "Marker: {0}".format(marker))
plt.legend()
plt.xlim(0, 1.4)

plt.show()

l   散佈圖-使用pyplot.scatter()繪製:
pyplot.scatter()的每一個點的屬性都可以個別控制或是對應到資料上,因此效能上pyplot.scatter()會比pyplot.plot()來得差:
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
x = rng.rand(100)
y = rng.rand(100)

colors = rng.rand(100)
sizes = 1000 * rng.rand(100)
plt.scatter(x, y, c = colors, sizes = sizes, alpha = 0.3)
plt.colorbar()

plt.show()

l   誤差圖:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 20)
dy = 0.5
y = np.sin(x) + dy * np.random.rand(20)

plt.errorbar(x, y, yerr = dy, fmt = "o")

plt.show()

l   等高線圖:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 5, 50)
y = np.linspace(0, 5, 40)
X, Y = np.meshgrid(x, y)
Z = np.sin(X) ** 10 + np.cos(10 + Y * X) * np.cos(X)

#
等高線圖
plt.contour(X, Y, Z)
plt.colorbar()
"""
#
等高線圖填色
plt.contourf(X, Y, Z, levels = 50, cmap = "RdGy")
plt.colorbar()

#
等高線圖影像,不接受XY格點,原點從預設左上角改為左下角
plt.imshow(Z, extent = [0, 5, 0, 5], origin = "lower", cmap = "RdGy")
plt.colorbar()

#
等高線標籤
contours = plt.contour(X, Y, Z, colors = "black")
plt.clabel(contours, fontsize = 8)
plt.imshow(Z, extent = [0, 5, 0, 5], origin = "lower", cmap = "RdGy", alpha = 0.5)
plt.colorbar()
"""
plt.show()

l   直方圖:
import numpy as np
import matplotlib.pyplot as plt

aData = np.random.normal(1, 2, 1000)
bData = np.random.randn(1000)

# density
引數為True表示顯示模式為機率密度
kwargs = dict(bins = 25, alpha = 0.3, density = True)
plt.hist(aData, **kwargs, label = "aData")
plt.hist(bData, **kwargs, label = "bData")
plt.legend()

plt.show()

l   直方圖-使用numpy.histogram()計算,不繪圖:
import numpy as np

aData = np.random.normal(1, 2, 1000)
bData = np.random.randn(1000)

aCounts, aBinEdges = np.histogram(aData, bins = 5)
bCounts, bBinEdges = np.histogram(bData, bins = 5)

print(aCounts, aBinEdges)
print(bCounts, bBinEdges)
l   二維直方圖:
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
x, y = np.random.multivariate_normal(mean = [0, 0], cov = [[1, 1], [1, 2]], size = 10000).T

plt.hist2d(x, y, bins = 50, cmap = "Blues")
# plt.hexbin(x, y, gridsize = 50, cmap = "Blues")
plt.colorbar(label = "Counts in Bin")

plt.show()

l   二維直方圖-使用numpy.histogram2d()計算,不繪圖:
import numpy as np

x, y = np.random.multivariate_normal(mean = [0, 0], cov = [[1, 1], [1, 2]], size = 10000).T

counts, xBinEdges, yBinEdges = np.histogram2d(x, y, bins = 5)

print(counts, xBinEdges, yBinEdges)

Matplotlib繪圖技巧-圖例與色彩條
l   選取圖例所要使用的元素:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)
y = np.sin(x[:, np.newaxis] + np.pi * np.arange(0, 2, 0.5))

#
方法一
lines = plt.plot(x, y)
plt.legend(lines[:2], ["First", "Second"])
"""
#
方法二
plt.plot(x, y[:, 0], label = "First")
plt.plot(x, y[:, 1], label = "Second")
plt.plot(x, y[:, 2:])
plt.legend()
"""
plt.show()

l   多重圖例:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.legend import Legend

x = np.linspace(0, 10, 100)
y = np.sin(x[:, np.newaxis] + np.pi * np.arange(0, 2, 0.5))

ax = plt.axes()
lines = ax.plot(x, y)
#
第一個圖例
ax.legend(lines[:2], ["First", "Second"], loc = "upper right")
#
第二個圖例
leg = Legend(ax, lines[2:], ["Third", "Fourth"], loc = "lower right")
ax.add_artist(leg)

plt.show()

l   設定圖例所要使用的資料點大小:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# https://github.com/jakevdp/PythonDataScienceHandbook/tree/master/notebooks/data
cities = pd.read_csv("California_cities.csv")
lat, lon = cities["latd"], cities["longd"]
population, area = cities["population_total"], cities["area_total_km2"]

# Scatter the cities: colour encodes log10(population), marker size encodes area
plt.scatter(lon, lat, label = None, c = np.log10(population), s = area, linewidth = 0, alpha = 0.5)
plt.title("California Cities: Area and Population")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
# Equal scaling on both axes. pyplot.axis() takes the option as a string;
# it has no `aspect` keyword, so axis(aspect = "equal") is not a valid call.
plt.axis("equal")
plt.colorbar(label = "log$_{10}$(population)", extend = "both")
plt.clim(3, 7)

# Plot empty X/Y lists so the legend shows markers of the desired sizes
for size in [100, 300, 500]:
    plt.scatter([], [], label = str(size) + "km$^2$", c = "gray", s = size, linewidth = 0, alpha = 0.5)
plt.legend(frameon = False, labelspacing = 1, title = "City Area")

plt.show()

l   設定連續或離散的色彩條:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 1000)
data = np.sin(x) * np.cos(x[:, np.newaxis])

#
連續的色彩條
plt.imshow(data, origin = "lower")
plt.colorbar()
"""
#
離散的色彩條
plt.imshow(data, origin = "lower", cmap = plt.cm.get_cmap("Blues", 6))
plt.colorbar()
"""
plt.show()


Matplotlib繪圖技巧-標註符號
l   標註點線符號:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)

plt.plot(x, np.sin(x))

# Mark a point
plt.plot(np.pi, np.sin(np.pi), "o")
# Mark a vertical line
plt.axvline(np.pi, alpha = 0.25, color = "red")
# Mark a horizontal line
plt.axhline(np.sin(np.pi), alpha = 0.25, color = "blue")
# Add a text label. Raw string: the backslash in the mathtext "\pi" must not
# be parsed as a (invalid) Python escape sequence.
plt.text(np.pi, -0.125, r"$x = \pi$", ha = "center")

plt.show()

l   標註箭頭符號:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)

plt.plot(x, np.sin(x))
plt.axis("equal")

#
標註箭頭與文字
plt.annotate("Maximum", xy = (np.pi / 2, 1), xytext = (2.57, 2), arrowprops = dict(facecolor = "black", shrink = 0.05))
plt.annotate("Minimum", xy = (np.pi / 2 * 3, -1), xytext = (2, -2), arrowprops = dict(arrowstyle = "->", connectionstyle = "angle3, angleA = 0, angleB = -90"))

plt.show()

l   標註座標轉換:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(facecolor = "lightgray")
ax.axis([0, 3, 0, 3])

#
預設的轉換是ax.transData,是資料座標相關的轉換
ax.text(0.2, 0.2, ". Data: (0.2, 0.2)")
# ax.transAxes
axes座標相關的轉換
ax.text(0.2, 0.2, ". Axes: (0.2, 0.2)", transform = ax.transAxes)
# fig.transFigure
figure座標相關的轉換
ax.text(0.2, 0.2, ". Figure: (0.2, 0.2)", transform = fig.transFigure)

plt.show()


Matplotlib繪圖技巧-多重圖表
l   母子圖表:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)

aAx = plt.axes()
# The list is [left, bottom, width, height]; per the Matplotlib documentation
# the values are fractions of the figure, locating the inset inside it.
bAx = plt.axes([0.65, 0.65, 0.2, 0.2])

aAx.plot(x, np.sin(x))
bAx.plot(x, np.cos(x))

plt.show()

l   並列圖表:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)

#
方法一
aAx = plt.axes([0.1, 0.5, 0.8, 0.4], xticklabels = [], ylim = (-1.2, 1.2))
bAx = plt.axes([0.1, 0.1, 0.8, 0.4], ylim = (-1.2, 1.2))
"""
#
方法二
fig = plt.figure()
aAx = fig.add_axes([0.1, 0.5, 0.8, 0.4], xticklabels = [], ylim = (-1.2, 1.2))
bAx = fig.add_axes([0.1, 0.1, 0.8, 0.4], ylim = (-1.2, 1.2))
"""
aAx.plot(x, np.sin(x))
bAx.plot(x, np.cos(x))

plt.show()

l   逐次建立網格圖表:
import numpy as np
import matplotlib.pyplot as plt

fig = plt.figure()
#
調整圖表之間的間隙
fig.subplots_adjust(hspace = 0.4, wspace = 0.4)

#
方法一
for i in range(1, 7):
    plt.subplot(2, 3, i)
    plt.text(0.5, 0.5, str((2, 3, i)), fontsize = 18, ha = "center")
"""
#
方法二
for i in range(1, 7):
    ax = fig.add_subplot(2, 3, i)
    ax.text(0.5, 0.5, str((2, 3, i)), fontsize = 18, ha = "center")
"""
plt.show()

l   一次建立網格圖表:
import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots(2, 3, sharex = "col", sharey = "row")

for i in range(2):
    for j in range(3):
        ax[i, j].text(0.5, 0.5, str((2, 3, i * 3 + j + 1)), fontsize = 18, ha = "center")

plt.show()

l   建立複雜網格圖表:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)

grid = plt.GridSpec(2, 3, wspace = 0.5, hspace = 0.25)
aAx = plt.subplot(grid[0, 0])
bAx = plt.subplot(grid[0, 1:])
cAx = plt.subplot(grid[1, :2])
dAx = plt.subplot(grid[1, 2])

aAx.plot(x, np.sin(x))
bAx.plot(x, np.cos(x))
cAx.plot(x, np.tan(x))
dAx.plot(np.tan(x), x)

plt.show()

l   使用網格圖表建立多軸直方圖:
import numpy as np
import matplotlib.pyplot as plt

x, y = np.random.multivariate_normal(mean = [0, 0], cov = [[1, 1], [1, 2]], size = 10000).T

grid = plt.GridSpec(4, 4, wspace = 0.4, hspace = 0.4)
mainAx = plt.subplot(grid[:-1, 1:])
yHist = plt.subplot(grid[:-1, 0], xticklabels = [], sharey = mainAx)
xHist = plt.subplot(grid[-1, 1:], yticklabels = [], sharex = mainAx)

mainAx.plot(x, y, "o", markersize = 2, alpha = 0.2)
xHist.hist(x, bins = 40, orientation = "vertical")
xHist.invert_yaxis()
yHist.hist(y, bins = 40, orientation = "horizontal")
yHist.invert_xaxis()

plt.show()


Matplotlib繪圖技巧-標籤刻度
l   特殊標籤與刻度,例如自然對數:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(np.e, np.e ** 3, 100)

ax = plt.axes(xscale = "log")
ax.plot(x, np.log(x))

plt.show()

l   使用pyplot.NullLocator()隱藏標籤與刻度:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)

ax = plt.axes()
ax.plot(x, np.sin(x))

#
隱藏X軸主要格線的標籤與刻度
ax.xaxis.set_major_locator(plt.NullLocator())
#
僅隱藏X軸主要格線的標籤
ax.xaxis.set_major_formatter(plt.NullFormatter())

plt.show()

l   使用pyplot.MaxNLocator()指定標籤與刻度數量:
import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots(4, 4, sharex = True, sharey = True)

for iAx in ax.flat:
    #
指定X軸主要格線的標籤與刻度數量
    iAx.xaxis.set_major_locator(plt.MaxNLocator(3))
    #
指定Y軸主要格線的標籤與刻度數量
    iAx.yaxis.set_major_locator(plt.MaxNLocator(3))

plt.show()

l   使用pyplot.MultipleLocator()設定標籤與刻度間距、
使用pyplot.FuncFormatter()重設標籤:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 3 * np.pi, 1000)

ax = plt.axes()
ax.plot(x, np.sin(x))
ax.axis("equal")
ax.grid(True)

#
設定X軸主要格線的標籤與刻度間距
ax.xaxis.set_major_locator(plt.MultipleLocator(np.pi / 2))
#
設定X軸次要格線的標籤與刻度間距
ax.xaxis.set_minor_locator(plt.MultipleLocator(np.pi / 4))

def formatFunc(value, tickNum):
    """Format a tick position as an integer multiple of pi/2.

    Args:
        value: Tick position in data units.
        tickNum: Second positional argument of the formatter callback; unused.

    Returns:
        The label string: the multiple followed by the mathtext for pi/2.
    """
    # Round to the nearest multiple. Truncating with int() alone would label a
    # tick stored as 1.9999999 half-pis as "1" instead of "2".
    num = int(round(value / np.pi * 2))
    return r"{0}$\pi/2$".format(num)
#
重設X軸主要格線的標籤
ax.xaxis.set_major_formatter(plt.FuncFormatter(formatFunc))

plt.show()

l   重設標籤與標籤樣式:
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 10, 100)

ax = plt.axes()
ax.plot(x, np.sin(x))

#
修改全部X軸文字
ax.set_xticklabels([None, 0, "x = 2", "x = 4", 6, 8, 10])
#
修改部分X軸文字樣式
ax.get_xticklabels()[2].set(weight = "heavy", color = "red")
ax.get_xticklabels()[3].set(weight = "heavy", color = "red")

plt.show()


Matplotlib繪圖技巧-繪圖樣式
l   繪圖樣式表:
import matplotlib.pyplot as plt
print(plt.style.available)

# FiveThirtyEight樣式:"fivethirtyeight"
# Ggplot樣式:"ggplot"
# Bayesian Methods for Hackers樣式:"bmh"
# 深色背景樣式:"dark_background"
# 灰階樣式:"grayscale"
# Seaborn樣式:"seaborn"
l   設定繪圖樣式:
import numpy as np
import matplotlib.pyplot as plt

def histAndLines(style):
    """Draw a histogram and three random lines side by side, both titled with `style`.

    Reseeds NumPy's global random generator with 0 on every call, so each
    call plots the same data.

    Args:
        style: Value shown (via str()) as the title of both panels.
    """
    label = str(style)
    np.random.seed(0)
    fig, (histAx, lineAx) = plt.subplots(1, 2, figsize = (11, 4))

    # Left panel: histogram of 1000 standard-normal samples
    histAx.hist(np.random.randn(1000))
    histAx.set(title = label)

    # Right panel: three lines of 10 uniform random values each
    for _ in range(3):
        lineAx.plot(np.random.rand(10))
    lineAx.legend(["First", "Second", "Third"])
    lineAx.set(title = label)

style = "fivethirtyeight"

#
方法一
plt.style.use(style)
histAndLines(style)
"""
#
方法二
with plt.style.context(style):
    histAndLines(style)
"""
plt.show()

l   自訂圖表的繪圖樣式:
import numpy as np
import matplotlib.pyplot as plt

x = np.random.randn(1000)

#
灰色背景
ax = plt.axes(facecolor = "#E6E6E6")
# X
軸刻度下移一層
ax.set_axisbelow(True)
#
白色實心格線
ax.grid(color = "w", linestyle = "solid")
#
隱藏邊線
for spine in ax.spines.values():
    spine.set_visible(False)
#
隱藏上面和右側的刻度
ax.xaxis.tick_bottom()
ax.yaxis.tick_left()
#
調整刻度和標籤顏色,並調整刻度方向
ax.tick_params(colors = "gray", direction = "out")

#
橘紅色直方圖
ax.hist(x, color = "#EE6666")

plt.show()
l   變更預設的繪圖樣式:
import numpy as np
import matplotlib.pyplot as plt

defaultStyle = plt.rcParams.copy()

x = np.random.randn(1000)

#
橘紅色與其它預設色彩
from matplotlib import cycler
colors = cycler("color", ["#EE6666", "#3388BB", "#9988DD", "#EECC55", "#88BB44", "#FFBBBB"])
#
灰色背景
# X
軸刻度下移一層
#
隱藏邊線,亦隱藏上面和右側的刻度
plt.rc("axes", facecolor = "#E6E6E6", axisbelow = True, edgecolor = "none", grid = True, prop_cycle = colors)
#
白色實心格線
plt.rc("grid", color = "w", linestyle = "solid")
#
調整刻度和標籤顏色,並調整刻度方向
plt.rc("xtick", direction = "out", color = "gray")
plt.rc("ytick", direction = "out", color = "gray")

#
重置rcParams
# plt.rcParams.update(defaultStyle)

#
直方圖
plt.hist(x)

plt.show()

Matplotlib三維圖表
l   三維的折線圖:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

z = np.linspace(0, 10, 100)
x = np.sin(z)
y = np.cos(z)

ax = plt.axes(projection = "3d")
ax.plot3D(x, y, z)

plt.show()
l   三維的散佈圖:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

theta = 2 * np.pi * np.random.rand(1000)
r = 6 * np.random.rand(1000)
x = np.ravel(r * np.sin(theta))
y = np.ravel(r * np.cos(theta))
z = np.sin(np.sqrt(x ** 2 + y ** 2))

ax = plt.axes(projection = "3d")
#
一般散佈圖
ax.scatter3D(x, y, z)
#
在鄰近點之間形成的三角形建立平面
# ax.plot_trisurf(x, y, z)

plt.show()
l   三維的等高線圖:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

x = np.linspace(-6, 6, 30)
y = np.linspace(-6, 6, 30)
X, Y = np.meshgrid(x, y)
Z = np.sin(np.sqrt(X ** 2 + Y ** 2))

ax = plt.axes(projection = "3d")
#
調整視角,XY平面上60度,Z軸逆時針35度方位角
ax.view_init(60, 35)
#
一般等高線圖
ax.contour3D(X, Y, Z, levels = 50)
#
線框圖
# ax.plot_wireframe(X, Y, Z)
#
表面圖
# ax.plot_surface(X, Y, Z)

plt.show()

Matplotlib地圖圖表
l   安裝Matplotlib地理資料工具包Basemap
因為此工具包所需之執行環境套件多且不易安裝,建議在Anaconda Prompt輸入conda install basemap指令,即可自動安裝完成。
l   基本地圖圖表:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

plt.figure(figsize = (8, 8))
# projection
引數為地圖投影法
# resolution
引數為地圖細節程度
# lat_0
引數設定地圖中間的緯度
# lon_0
引數設定地圖中間的經度
m = Basemap(projection = "lcc", resolution = None, lat_0 = 45, lon_0 = -100, width = 8E6, height = 8E6)
#
使用Basemap繪製地圖圖表的方法
m.etopo(scale = 0.5, alpha = 0.5)

#
對應(longitude, latitude)(x, y)以繪製標示點與文字
x, y = m(-122.3, 47.6)
plt.plot(x, y, "ok", markersize = 5)
plt.text(x, y, " Seattle", fontsize = 12)

plt.show()
l   地圖投影法範例:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from itertools import chain

def drawMap(m, scale = 0.2):
    """Draw a shaded-relief map with a faint white latitude/longitude grid.

    Args:
        m: Map object providing shadedrelief(), drawparallels() and
            drawmeridians() (a Basemap instance in this example).
        scale: Passed through to m.shadedrelief(scale = ...).
    """
    m.shadedrelief(scale = scale)
    # The parallels and meridians are returned as dictionaries
    lats = m.drawparallels(np.linspace(-90, 90, 13))
    lons = m.drawmeridians(np.linspace(-180, 180, 13))
    # The first item of every dictionary value holds the drawn line objects;
    # flatten them into one stream per dictionary
    latLines = chain.from_iterable(entry[0] for entry in lats.values())
    lonLines = chain.from_iterable(entry[0] for entry in lons.values())
    # Apply the desired style to every parallel and meridian
    for line in chain(latLines, lonLines):
        line.set(linestyle = "-", alpha = 0.3, color = "w")

plt.figure(figsize = (8, 8))

#
圓柱投影法(緯度lat、經度lon、下半角落llcrnr、上半角落urcrnr)
m = Basemap(projection = "cyl", resolution = None, llcrnrlat = -90, urcrnrlat = 90, llcrnrlon = -180, urcrnrlon = 180)
"""
#
摩爾魏投影法
m = Basemap(projection = "moll", resolution = None, lat_0 = 0, lon_0 = 0)

#
正交投影法
m = Basemap(projection = "ortho", resolution = None, lat_0 = 50, lon_0 = 0)

#
藍伯特正圓錐投影法(兩條平行的緯線lat_1lat_2)
m = Basemap(projection = "lcc", resolution = None, lon_0 = 0, lat_0 = 50, lat_1 = 45, lat_2 = 55, width = 1.6E7, height = 1.2E7)
"""
drawMap(m)

plt.show()
l   地圖細節程度範例:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

fig, ax = plt.subplots(1, 2, figsize = (12, 8))
for ind, res in enumerate(["c", "l"]):
    m = Basemap(projection = "gnom", lat_0 = 57.3, lon_0 = -6.2, width = 2E6, height = 2E6, resolution = res, ax = ax[ind])
    m.fillcontinents(color = "#FFDDCC", lake_color = "#DDEEFF")
    m.drawmapboundary(fill_color = "#DDEEFF")
    m.drawcoastlines()
    ax[ind].set_title("resolution = '{0}'".format(res))

plt.show()

使用Seaborn進行視覺化
l   使用Seaborn樣式:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

x = np.linspace(0, 10, 500)
y = np.cumsum(np.random.randn(500, 3), axis = 0)

#
使用Seaborn樣式,亦可使用with sns.axes_style("white"):
sns.set()
plt.plot(x, y)
plt.legend(["First", "Second", "Third"])

plt.show()
l   核密度估計圖(Kernel Density Estimation, KDE)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(0)
data = np.random.multivariate_normal(mean = [0, 0], cov = [[5, 2], [2, 2]], size = 2000)
data = pd.DataFrame(data, columns = ["x", "y"])

#
一維KDE和直方圖
sns.distplot(data["x"])
sns.distplot(data["y"])
"""
#
一維KDE
sns.kdeplot(data["x"], shade = True)
sns.kdeplot(data["y"], shade = True)

#
二維KDE
sns.kdeplot(data["x"], data["y"])
"""
plt.show()

l   直條圖(Cat Plot, kind = "count")
import matplotlib.pyplot as plt
import seaborn as sns

planets = sns.load_dataset("planets")

# Name the column with x = ...: seaborn 0.12+ no longer accepts it
# positionally, while the keyword form also works on older releases.
grid = sns.catplot(x = "year", data = planets, kind = "count", hue = "method", order = range(2011, 2015))
grid.set_ylabels("Number of Planets Discovered")

plt.show()

l   盒鬚圖(Cat Plot, kind = "box")
import matplotlib.pyplot as plt
import seaborn as sns

tips = sns.load_dataset("tips")
tips["tip_pct"] = 100 * tips["tip"] / tips["total_bill"]

# x / y / hue are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, while the keyword form also works on older releases.
grid = sns.catplot(x = "day", y = "total_bill", hue = "sex", data = tips, kind = "box")
grid.set_axis_labels("Day", "Total Bill")

plt.show()

l   小提琴圖(Violin Plot)
import matplotlib.pyplot as plt
import seaborn as sns

tips = sns.load_dataset("tips")
# Tip as a percentage of the total bill
tips["tip_pct"] = 100 * tips["tip"] / tips["total_bill"]

# x / y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, while the keyword form also works on older releases.
sns.violinplot(x = "sex", y = "tip_pct", data = tips, palette = ["lightblue", "lightpink"])

plt.show()

l   回歸線圖(Linear Model Plot)
import matplotlib.pyplot as plt
import seaborn as sns

tips = sns.load_dataset("tips")
tips["tip_pct"] = 100 * tips["tip"] / tips["total_bill"]

# x / y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, while the keyword form also works on older releases.
sns.lmplot(x = "tip", y = "total_bill", col = "sex", data = tips, markers = ".", scatter_kws = dict(color = "c"))

plt.show()

l   聯合分佈圖(Joint Plot)
import matplotlib.pyplot as plt
import seaborn as sns

tips = sns.load_dataset("tips")
tips["tip_pct"] = 100 * tips["tip"] / tips["total_bill"]

# x / y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, while the keyword form also works on older releases.
sns.jointplot(x = "total_bill", y = "tip", data = tips, kind = "reg")
# sns.jointplot(x = "total_bill", y = "tip", data = tips, kind = "kde")

plt.show()

l   多面向直方圖(Faceted Histogram)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

tips = sns.load_dataset("tips")
tips["tip_pct"] = 100 * tips["tip"] / tips["total_bill"]

grid = sns.FacetGrid(data = tips, row = "sex", col = "time", margin_titles = True)
grid.map(plt.hist, "tip_pct", bins = np.linspace(0, 40, 15))

plt.show()

l   成對圖表(Pair Plots)
import matplotlib.pyplot as plt
import seaborn as sns

iris = sns.load_dataset("iris")

#
方法一
sns.pairplot(iris, hue = "species")
"""
#
方法二
grid = sns.PairGrid(iris, hue = "species")
grid.map(plt.scatter)
grid.add_legend()
"""
plt.show()


整合NumPy、Pandas、Matplotlib、Seaborn實作範例
l   清理與篩選資料,視覺化美國每日平均出生數:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

# https://github.com/jakevdp/PythonDataScienceHandbook/tree/master/notebooks/data
birthsData = pd.read_csv("births.csv")

# Clean the data with sigma-clipping
quartiles = np.percentile(birthsData["births"], [25, 50, 75])
mu = quartiles[1]
# Robust estimate of the sample spread; the 0.74 constant comes from the
# interquartile range of a Gaussian distribution
sig = 0.74 * (quartiles[2] - quartiles[0])
# Keep only the rows within five sigma of the median
cleanBirthsData = birthsData.query("(births > @mu - 5 * @sig) & (births < @mu + 5 * @sig)")
# Build a date index from the year / month / day columns
cleanBirthsData.index = pd.to_datetime(10000 * cleanBirthsData["year"] + 100 * cleanBirthsData["month"] + cleanBirthsData["day"], format = "%Y%m%d")

# Aggregate births by (month, day) across all years
birthsByDate = cleanBirthsData.pivot_table("births", [cleanBirthsData.index.month, cleanBirthsData.index.day])
# Turn (month, day) back into dates. The data contains February 29, so a leap
# year is picked as a placeholder. pd.Timestamp is used because the
# pd.datetime alias has been removed from pandas.
birthsByDate.index = [pd.Timestamp(2016, month, day) for (month, day) in birthsByDate.index]

fig, ax = plt.subplots(figsize = (12, 4))
birthsByDate.plot(ax = ax)
# Title and axis label
ax.set(title = "USA Births by Day of Year (1969-1988)", ylabel = "Average Daily Births")
# Reset the date tick labels so that only month names are shown, hiding the
# placeholder leap year chosen above
ax.xaxis.set_major_locator(plt.NullLocator())
ax.xaxis.set_minor_locator(plt.NullLocator())
ax.xaxis.set_major_locator(mpl.dates.MonthLocator(bymonthday = 1))
ax.xaxis.set_major_formatter(mpl.dates.DateFormatter("%b"))

plt.show()
l   依工作日與週末,視覺化西雅圖每小時平均自行車流量:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# https://data.seattle.gov/Transportation/Fremont-Bridge-Hourly-Bicycle-Counts-by-Month-Octo/65db-xm6k
data = pd.read_csv("Fremont_Bridge.csv", index_col = "Date", parse_dates = True)
data.columns = ["East", "West"]
data["Total"] = data["East"] + data["West"]

weekArray = np.where(data.index.weekday < 5, "Weekday", "Weekend")
byTime = data.groupby([weekArray, data.index.time]).mean()

fig, ax = plt.subplots(1, 2, figsize = (14, 5))
byTime.loc["Weekday"].plot(ax = ax[0], title = "Weekdays", xticks = 60 * 60 * 4 * np.arange(6), style = [":", "--", "-"])
byTime.loc["Weekend"].plot(ax = ax[1], title = "Weekends", xticks = 60 * 60 * 4 * np.arange(6), style = [":", "--", "-"])

plt.show()
l   探索馬拉松完成時間:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def convertTime(obj):
    """Convert `obj` to a timedelta by delegating to pd.to_timedelta()."""
    converted = pd.to_timedelta(obj)
    return converted
# https://github.com/jakevdp/marathon-data
# Convert the "split" and "final" columns to timedeltas through the converters argument
data = pd.read_csv("marathon-data.csv", converters = {"split": convertTime, "final": convertTime})

# Times expressed in seconds as plain floats. astype("timedelta64[s]") only
# produced numbers on older pandas; on pandas 2.x it keeps a timedelta dtype,
# so the seconds are taken with .dt.total_seconds() instead.
data["splitSec"] = pd.to_timedelta(data["split"]).dt.total_seconds()
data["finalSec"] = pd.to_timedelta(data["final"]).dt.total_seconds()
# Split fraction: a negative value means the second half of the marathon was
# run faster than the first half
data["splitFrac"] = 1 - 2 * data["splitSec"] / data["finalSec"]
# Bin the runners' ages by decade
data["ageDec"] = data.age.map(lambda age: 10 * (age // 10))

# Joint plot comparing the first-half time with the full marathon time.
# x / y are passed by keyword because seaborn 0.12+ rejects them positionally.
grid = sns.jointplot(x = "splitSec", y = "finalSec", data = data, kind = "hex")
# Dotted reference line where the final time is exactly twice the split time
grid.ax_joint.plot(np.linspace(4000, 16000), np.linspace(8000, 32000), ":k")
"""
#
成對圖表(Pair Plots),對比各欄位資料的相關性
grid = sns.PairGrid(data, vars = ["age", "splitSec", "finalSec", "splitFrac"], hue = "gender")
grid.map(plt.scatter, alpha = 0.5)
grid.add_legend()

#
一維KDE和直方圖,畫出半程分率(Split Fraction)為零的界線
grid = sns.distplot(data["splitFrac"])
grid.axvline(0, linestyle = ":", color = "k")

#
一維KDE圖,對比男性和女性半程分率的差異
sns.kdeplot(data.splitFrac[data.gender == "M"], label = "Men", shade = True)
sns.kdeplot(data.splitFrac[data.gender == "W"], label = "Women", shade = True)
plt.xlabel("splitFrac")

#
小提琴圖(Violin Plot),對比男性和女性不同年齡層半程分率的差異
sns.violinplot("ageDec", "splitFrac", hue = "gender", data = data, split = True, inner = "quartile", palette = ["lightblue", "lightpink"])

#
回歸線圖(Linear Model Plot),畫出半程分率與全程馬拉松時間的關係
sns.lmplot("finalSec", "splitFrac", col = "gender", data = data, markers = ".", scatter_kws = dict(color = "c"))
"""
plt.show()

沒有留言:

張貼留言