Numpy Tutorial:NDArray, Array indexing, Broadcasting

前回のNumpy tutorialの続きを、このサイトを参考にしながらやる。

スポンサーリンク

NDArray

numpyの基礎構造はndarrayで、vectors(ベクトル), matrices(行列) and higher-dimensional arrays(高次元配列)を表すのに使用される。各ndarrayは以下の属性を有する。

  • dtype = Cのデータ型に対応
  • shape = 配列の次元
  • strides = 配列トラバース時の各方向へのステップに要するバイト数
import os
import sys
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
%precision 4
plt.style.use('ggplot')
x = np.array([1,2,3,4,5,6])
print (x)
print ('dytpe', x.dtype)
print ('shape', x.shape)
print ('strides', x.strides)
[1 2 3 4 5 6]
dytpe int64
shape (6,)
strides (8,)
x.shape = (2,3)
print (x)
print ('dytpe', x.dtype)
print ('shape', x.shape)
print ('strides', x.strides)
[[1 2 3]
 [4 5 6]]
dytpe int64
shape (2, 3)
strides (24, 8)
x = x.astype('complex')
print (x)
print ('dytpe', x.dtype)
print ('shape', x.shape)
print ('strides', x.strides)
[[1.+0.j 2.+0.j 3.+0.j]
 [4.+0.j 5.+0.j 6.+0.j]]
dytpe complex128
shape (2, 3)
strides (48, 16)

Creating arrays

# from lists
x_list = [(i,j) for i in range(2) for j in range(3)]
print (x_list, '\n')
x_array = np.array(x_list)
print (x_array)
[(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)] 

[[0 0]
 [0 1]
 [0 2]
 [1 0]
 [1 1]
 [1 2]]
# Using convenience functions

print (np.ones((3,2)), '\n')
print (np.zeros((3,2)), '\n')
print (np.eye(3), '\n')
print (np.diag([1,2,3]), '\n')
print (np.fromfunction(lambda i, j: (i-2)**2+(j-2)**2, (5,5)))
[[1. 1.]
 [1. 1.]
 [1. 1.]] 

[[0. 0.]
 [0. 0.]
 [0. 0.]] 

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]] 

[[1 0 0]
 [0 2 0]
 [0 0 3]] 

[[8. 5. 4. 5. 8.]
 [5. 2. 1. 2. 5.]
 [4. 1. 0. 1. 4.]
 [5. 2. 1. 2. 5.]
 [8. 5. 4. 5. 8.]]

Array indexing

正規偏差から10✕6配列を作成してints(整数値)に変換する。

# Create a 10 by 6 array from normal deviates and convert to ints
n, nrows, ncols = 100, 10, 6
xs = np.random.normal(n, 15, size=(nrows, ncols)).astype('int')
xs
array([[111,  94, 103, 100, 134, 100],
       [ 91, 112,  86, 113, 101, 114],
       [108,  97, 130, 114,  98,  81],
       [110,  79,  82, 109, 118,  98],
       [ 90,  90,  87,  98,  87,  89],
       [ 90,  95, 102,  77,  81, 104],
       [109,  98,  97, 115, 107, 121],
       [ 75,  95, 110, 123, 115, 114],
       [ 69, 119,  71,  92,  91,  85],
       [ 90,  97,  89,  82,  87,  81]])

スライス表記を使用する。

# Use slice notation
print(xs[0,0])
print(xs[-1,-1])
print(xs[3,:])
print(xs[:,0])
print(xs[::2,::2])
print(xs[2:5,2:5])
111
81
[110  79  82 109 118  98]
[111  91 108 110  90  90 109  75  69  90]
[[111 103 134]
 [108 130  98]
 [ 90  87  87]
 [109  97 107]
 [ 69  71  91]]
[[130 114  98]
 [ 82 109 118]
 [ 87  98  87]]

整数値リストを使ってインデックス化する。

#  Indexing with list of integers
print(xs[0, [1,2,4,5]])
[ 94 103 134 100]
# Boolean indexing
print(xs[xs % 2 == 0])
xs[xs % 2 == 0] = 0 # set even entries to zero
print(xs)
[ 94 100 134 100 112  86 114 108 130 114  98 110  82 118  98  90  90  98
  90 102 104  98 110 114  92  90  82]
[[111   0 103   0   0   0]
 [ 91   0   0 113 101   0]
 [  0  97   0   0   0  81]
 [  0  79   0 109   0   0]
 [  0   0  87   0  87  89]
 [  0  95   0  77  81   0]
 [109   0  97 115 107 121]
 [ 75  95   0 123 115   0]
 [ 69 119  71   0  91  85]
 [  0  97  89   0  87  81]]
# Extracting lower triangular, diagonal and upper triangular matrices

a = np.arange(16).reshape(4,4)
print (a, '\n')
print (np.tril(a, -1), '\n')
print (np.diag(np.diag(a)), '\n')
print (np.triu(a, 1))
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 
] [[ 0 0 0 0] [ 4 0 0 0] [ 8 9 0 0]
] [[ 0 0 0 0] [ 0 5 0 0] [ 0 0 10 0] [ 0 0 0 15]] [[ 0 1 2 3] [ 0 0 6 7] [ 0 0 0 11] [ 0 0 0 0]]

Broadcasting, row, column and matrix operations

行、列、全行列に対する演算

# operations across rows, cols or entire matrix
print(xs.max())
print(xs.max(axis=0)) # max of each col
print(xs.max(axis=1)) # max of each row
123
[111 119 103 123 115 121]
[111 113  97 109  89  95 121 123 119  97]
# A funcitonal rather than object-oriented approacha also wokrs
print(np.max(xs, axis=0))
print(np.max(xs, axis=1))
[111 119 103 123 115 121]
[111 113  97 109  89  95 121 123 119  97]
# broadcasting
xs = np.arange(12).reshape(2,6)
print(xs, '\n')
print(xs * 10, '\n')

# broadcasting just works when doing column-wise operations
col_means = xs.mean(axis=0)
print(col_means, '\n')
print(xs + col_means, '\n')

# but needs a little more work for row-wise operations
row_means = xs.mean(axis=1)[:, np.newaxis]
print(row_means)
print(xs + row_means)
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]] 

[[  0  10  20  30  40  50]
 [ 60  70  80  90 100 110]] 

[3. 4. 5. 6. 7. 8.] 

[[ 3.  5.  7.  9. 11. 13.]
 [ 9. 11. 13. 15. 17. 19.]] 

[[2.5]
 [8.5]]
[[ 2.5  3.5  4.5  5.5  6.5  7.5]
 [14.5 15.5 16.5 17.5 18.5 19.5]]
# convert matrix to have zero mean and unit standard deviation 
# using col summary statistics
print((xs - xs.mean(axis=0))/xs.std(axis=0))
[[-1. -1. -1. -1. -1. -1.]
 [ 1.  1.  1.  1.  1.  1.]]
# convert matrix to have zero mean and unit standard deviation 
# using row summary statistics
print((xs - xs.mean(axis=1)[:, np.newaxis])/xs.std(axis=1)[:, np.newaxis])
[[-1.4639 -0.8783 -0.2928  0.2928  0.8783  1.4639]
 [-1.4639 -0.8783 -0.2928  0.2928  0.8783  1.4639]]
# broadcasting for outer product
# e.g. create the 12x12 multiplication toable
u = np.arange(1, 13)
u[:,None] * u[None,:]
array([[  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12],
       [  2,   4,   6,   8,  10,  12,  14,  16,  18,  20,  22,  24],
       [  3,   6,   9,  12,  15,  18,  21,  24,  27,  30,  33,  36],
       [  4,   8,  12,  16,  20,  24,  28,  32,  36,  40,  44,  48],
       [  5,  10,  15,  20,  25,  30,  35,  40,  45,  50,  55,  60],
       [  6,  12,  18,  24,  30,  36,  42,  48,  54,  60,  66,  72],
       [  7,  14,  21,  28,  35,  42,  49,  56,  63,  70,  77,  84],
       [  8,  16,  24,  32,  40,  48,  56,  64,  72,  80,  88,  96],
       [  9,  18,  27,  36,  45,  54,  63,  72,  81,  90,  99, 108],
       [ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120],
       [ 11,  22,  33,  44,  55,  66,  77,  88,  99, 110, 121, 132],
       [ 12,  24,  36,  48,  60,  72,  84,  96, 108, 120, 132, 144]])

下記のポイント間のペアワイズ距離行列を計算する。

  • (0,0)
  • (4,0)
  • (4,3)
  • (0,3)
def distance_matrix_py(pts):
    """Returns matrix of pairwise Euclidean distances. Pure Python version."""
    n = len(pts)
    p = len(pts[0])
    m = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            s = 0
            for k in range(p):
                s += (pts[i,k] - pts[j,k])**2
            m[i, j] = s**0.5
    return m
def distance_matrix_np(pts):
    """Returns matrix of pairwise Euclidean distances. Vectorized numpy version."""
    return np.sum((pts[None,:] - pts[:, None])**2, -1)**0.5
pts = np.array([(0,0), (4,0), (4,3), (0,3)])
distance_matrix_py(pts)
array([[0., 4., 5., 3.],
       [4., 0., 3., 5.],
       [5., 3., 0., 4.],
       [3., 5., 4., 0.]])
distance_matrix_np(pts)
array([[0., 4., 5., 3.],
       [4., 0., 3., 5.],
       [5., 3., 0., 4.],
       [3., 5., 4., 0.]])
# Broaccasting and vectorization is faster than looping
%timeit distance_matrix_py(pts)
%timeit distance_matrix_np(pts)
48.8 µs ± 599 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
7.98 µs ± 58.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

python loopよりもnumpy vectorizationの方が処理がかなり高速だ。