# Numpy Tutorial：NDArray, Array indexing, Broadcasting

スポンサーリンク

## NDArray¶

numpyの基礎構造はndarrayで、vectors(ベクトル), matrices(行列) and higher-dimensional arrays(高次元配列)を表すのに使用される。各ndarrayは以下の属性を有する。

• dtype = Cのデータ型に対応
• shape = 配列の次元
• strides = 配列トラバース時の各方向へのステップに要するバイト数
import os
import sys
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
%precision 4
plt.style.use('ggplot')

x = np.array([1,2,3,4,5,6])
print (x)
print ('dytpe', x.dtype)
print ('shape', x.shape)
print ('strides', x.strides)

1
dytpe int64
shape (6,)
strides (8,)

x.shape = (2,3)
print (x)
print ('dytpe', x.dtype)
print ('shape', x.shape)
print ('strides', x.strides)

[2
[4 5 6]]
dytpe int64
shape (2, 3)
strides (24, 8)

x = x.astype('complex')
print (x)
print ('dytpe', x.dtype)
print ('shape', x.shape)
print ('strides', x.strides)

[3
[4.+0.j 5.+0.j 6.+0.j]]
dytpe complex128
shape (2, 3)
strides (48, 16)

スポンサーリンク

## Creating arrays¶

# from lists
x_list = [(i,j) for i in range(2) for j in range(3)]
print (x_list, '\n')
x_array = np.array(x_list)
print (x_array)

[(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]

[[0 0]
[0 1]
[0 2]
4
5
6]

# Using convenience functions

print (np.ones((3,2)), '\n')
print (np.zeros((3,2)), '\n')
print (np.eye(3), '\n')
print (np.diag([1,2,3]), '\n')
print (np.fromfunction(lambda i, j: (i-2)**2+(j-2)**2, (5,5)))

[7
8
9]

[[0. 0.]
[0. 0.]
[0. 0.]]

[10
[0. 1. 0.]
[0. 0. 1.]]

[11
[0 2 0]
[0 0 3]]

[[8. 5. 4. 5. 8.]
[5. 2. 1. 2. 5.]
[4. 1. 0. 1. 4.]
[5. 2. 1. 2. 5.]
[8. 5. 4. 5. 8.]]

スポンサーリンク

## Array indexing¶

# Create a 10 by 6 array from normal deviates and convert to ints
n, nrows, ncols = 100, 10, 6
xs = np.random.normal(n, 15, size=(nrows, ncols)).astype('int')
xs

array([[111,  94, 103, 100, 134, 100],
[ 91, 112,  86, 113, 101, 114],
[108,  97, 130, 114,  98,  81],
[110,  79,  82, 109, 118,  98],
[ 90,  90,  87,  98,  87,  89],
[ 90,  95, 102,  77,  81, 104],
[109,  98,  97, 115, 107, 121],
[ 75,  95, 110, 123, 115, 114],
[ 69, 119,  71,  92,  91,  85],
[ 90,  97,  89,  82,  87,  81]])

スライス表記を使用する。

# Use slice notation
print(xs[0,0])
print(xs[-1,-1])
print(xs[3,:])
print(xs[:,0])
print(xs[::2,::2])
print(xs[2:5,2:5])

111
81
[110  79  82 109 118  98]
[111  91 108 110  90  90 109  75  69  90]
[[111 103 134]
[108 130  98]
[ 90  87  87]
[109  97 107]
[ 69  71  91]]
[[130 114  98]
[ 82 109 118]
[ 87  98  87]]


#  Indexing with list of integers
print(xs[0, [1,2,4,5]])

[ 94 103 134 100]

# Boolean indexing
print(xs[xs % 2 == 0])
xs[xs % 2 == 0] = 0 # set even entries to zero
print(xs)

[ 94 100 134 100 112  86 114 108 130 114  98 110  82 118  98  90  90  98
90 102 104  98 110 114  92  90  82]
[[111   0 103   0   0   0]
[ 91   0   0 113 101   0]
[  0  97   0   0   0  81]
[  0  79   0 109   0   0]
[  0   0  87   0  87  89]
[  0  95   0  77  81   0]
[109   0  97 115 107 121]
[ 75  95   0 123 115   0]
[ 69 119  71   0  91  85]
[  0  97  89   0  87  81]]

# Extracting lower triangular, diagonal and upper triangular matrices

a = np.arange(16).reshape(4,4)
print (a, '\n')
print (np.tril(a, -1), '\n')
print (np.diag(np.diag(a)), '\n')
print (np.triu(a, 1))

[[ 0  1  2  3]
[ 4  5  6  7]
[ 8  9 10 11]
[12 13 14 15]]

[[ 0  0  0  0]
[ 4  0  0  0]
[ 8  9  0  0]
[12 13 14  0]]

[[ 0  0  0  0]
[ 0  5  0  0]
[ 0  0 10  0]
[ 0  0  0 15]]

[[ 0  1  2  3]
[ 0  0  6  7]
[ 0  0  0 11]
[ 0  0  0  0]]

スポンサーリンク

## Broadcasting, row, column and matrix operations¶

# operations across rows, cols or entire matrix
print(xs.max())
print(xs.max(axis=0)) # max of each col
print(xs.max(axis=1)) # max of each row

123
[111 119 103 123 115 121]
[111 113  97 109  89  95 121 123 119  97]

# A funcitonal rather than object-oriented approacha also wokrs
print(np.max(xs, axis=0))
print(np.max(xs, axis=1))

[111 119 103 123 115 121]
[111 113  97 109  89  95 121 123 119  97]

# broadcasting
xs = np.arange(12).reshape(2,6)
print(xs, '\n')
print(xs * 10, '\n')

# broadcasting just works when doing column-wise operations
col_means = xs.mean(axis=0)
print(col_means, '\n')
print(xs + col_means, '\n')

# but needs a little more work for row-wise operations
row_means = xs.mean(axis=1)[:, np.newaxis]
print(row_means)
print(xs + row_means)

[[ 0  1  2  3  4  5]
[ 6  7  8  9 10 11]]

[[  0  10  20  30  40  50]
[ 60  70  80  90 100 110]]

[3. 4. 5. 6. 7. 8.]

[[ 3.  5.  7.  9. 11. 13.]
[ 9. 11. 13. 15. 17. 19.]]

[[2.5]
[8.5]]
[[ 2.5  3.5  4.5  5.5  6.5  7.5]
[14.5 15.5 16.5 17.5 18.5 19.5]]

# convert matrix to have zero mean and unit standard deviation
# using col summary statistics
print((xs - xs.mean(axis=0))/xs.std(axis=0))

[[-1. -1. -1. -1. -1. -1.]
[ 1.  1.  1.  1.  1.  1.]]

# convert matrix to have zero mean and unit standard deviation
# using row summary statistics
print((xs - xs.mean(axis=1)[:, np.newaxis])/xs.std(axis=1)[:, np.newaxis])

[[-1.4639 -0.8783 -0.2928  0.2928  0.8783  1.4639]
[-1.4639 -0.8783 -0.2928  0.2928  0.8783  1.4639]]

# broadcasting for outer product
# e.g. create the 12x12 multiplication toable
u = np.arange(1, 13)
u[:,None] * u[None,:]

array([[  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12],
[  2,   4,   6,   8,  10,  12,  14,  16,  18,  20,  22,  24],
[  3,   6,   9,  12,  15,  18,  21,  24,  27,  30,  33,  36],
[  4,   8,  12,  16,  20,  24,  28,  32,  36,  40,  44,  48],
[  5,  10,  15,  20,  25,  30,  35,  40,  45,  50,  55,  60],
[  6,  12,  18,  24,  30,  36,  42,  48,  54,  60,  66,  72],
[  7,  14,  21,  28,  35,  42,  49,  56,  63,  70,  77,  84],
[  8,  16,  24,  32,  40,  48,  56,  64,  72,  80,  88,  96],
[  9,  18,  27,  36,  45,  54,  63,  72,  81,  90,  99, 108],
[ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120],
[ 11,  22,  33,  44,  55,  66,  77,  88,  99, 110, 121, 132],
[ 12,  24,  36,  48,  60,  72,  84,  96, 108, 120, 132, 144]])

• (0,0)
• (4,0)
• (4,3)
• (0,3)
def distance_matrix_py(pts):
"""Returns matrix of pairwise Euclidean distances. Pure Python version."""
n = len(pts)
p = len(pts[0])
m = np.zeros((n, n))
for i in range(n):
for j in range(n):
s = 0
for k in range(p):
s += (pts[i,k] - pts[j,k])**2
m[i, j] = s**0.5
return m

def distance_matrix_np(pts):
"""Returns matrix of pairwise Euclidean distances. Vectorized numpy version."""
return np.sum((pts[None,:] - pts[:, None])**2, -1)**0.5

pts = np.array([(0,0), (4,0), (4,3), (0,3)])

distance_matrix_py(pts)

array([[0., 4., 5., 3.],
[4., 0., 3., 5.],
[5., 3., 0., 4.],
[3., 5., 4., 0.]])
distance_matrix_np(pts)

array([[0., 4., 5., 3.],
[4., 0., 3., 5.],
[5., 3., 0., 4.],
[3., 5., 4., 0.]])
# Broaccasting and vectorization is faster than looping
%timeit distance_matrix_py(pts)
%timeit distance_matrix_np(pts)

48.8 µs ± 599 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
7.98 µs ± 58.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


python loopよりもnumpy vectorizationの方が処理がかなり高速だ。

スポンサーリンク
スポンサーリンク

フォローする