Python Python NumPy Essentials: Numerical Computing in Python

NumPy Essentials: Numerical Computing in Python

AS
Aman Saurav
| Dec 25, 2024 |
read
#numpy #arrays #scientific-computing #data-science

NumPy Essentials: Numerical Computing in Python

NumPy (Numerical Python) is the fundamental package for scientific computing in Python. It provides powerful array objects and tools for working with numerical data efficiently.

Why NumPy?

Performance Comparison

import numpy as np
import time

# Python list
python_list = list(range(1000000))
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is wall-clock time and can be coarse or jump.
start = time.perf_counter()
result = [x * 2 for x in python_list]
print(f"Python list: {time.perf_counter() - start:.4f}s")

# NumPy array
numpy_array = np.arange(1000000)
start = time.perf_counter()
result = numpy_array * 2
print(f"NumPy array: {time.perf_counter() - start:.4f}s")

# Output:
# Python list: 0.0823s
# NumPy array: 0.0012s  ← 68x faster!

Why NumPy is Faster:

  • Written in C
  • Vectorized operations
  • Contiguous memory storage
  • No per-element Python interpreter overhead

Installation

# Using pip
pip install numpy

# Using conda
conda install numpy

# Verify installation
python -c "import numpy; print(numpy.__version__)"

NumPy Arrays

Creating Arrays

import numpy as np

# From Python list
arr = np.array([1, 2, 3, 4, 5])
print(arr)  # [1 2 3 4 5]

# 2D array
arr_2d = np.array([[1, 2, 3], [4, 5, 6]])
print(arr_2d)
# [[1 2 3]
#  [4 5 6]]

# Using built-in functions
zeros = np.zeros((3, 4))        # 3x4 array of zeros
ones = np.ones((2, 3))          # 2x3 array of ones
empty = np.empty((2, 2))        # Uninitialized array
full = np.full((3, 3), 7)       # 3x3 array filled with 7

# Ranges
arange = np.arange(0, 10, 2)    # [0 2 4 6 8]
linspace = np.linspace(0, 1, 5) # [0. 0.25 0.5 0.75 1.]

# Random arrays
random = np.random.random((3, 3))      # Uniform [0, 1)
randn = np.random.randn(3, 3)          # Standard normal
randint = np.random.randint(0, 10, 5)  # Random integers

# Identity matrix
identity = np.eye(3)

Array Attributes

arr = np.array([[1, 2, 3], [4, 5, 6]])

print(arr.shape)      # (2, 3) - dimensions
print(arr.ndim)       # 2 - number of dimensions
print(arr.size)       # 6 - total elements
print(arr.dtype)      # int64 - data type
print(arr.itemsize)   # 8 - bytes per element
print(arr.nbytes)     # 48 - total bytes

Array Operations

Arithmetic Operations

a = np.array([1, 2, 3, 4])
b = np.array([10, 20, 30, 40])

# Element-wise operations
print(a + b)    # [11 22 33 44]
print(a - b)    # [-9 -18 -27 -36]
print(a * b)    # [10 40 90 160]
print(a / b)    # [0.1 0.1 0.1 0.1]
print(a ** 2)   # [1 4 9 16]

# Scalar operations
print(a + 10)   # [11 12 13 14]
print(a * 2)    # [2 4 6 8]

# Mathematical functions
print(np.sqrt(a))      # [1. 1.41 1.73 2.]
print(np.exp(a))       # [2.72 7.39 20.09 54.60]
print(np.log(a))       # [0. 0.69 1.10 1.39]
print(np.sin(a))       # [0.84 0.91 0.14 -0.76]

Aggregation Functions

arr = np.array([[1, 2, 3], [4, 5, 6]])

print(arr.sum())        # 21 - total sum
print(arr.min())        # 1 - minimum
print(arr.max())        # 6 - maximum
print(arr.mean())       # 3.5 - average
print(arr.std())        # 1.71 - standard deviation
print(arr.var())        # 2.92 - variance

# Axis-specific operations
print(arr.sum(axis=0))  # [5 7 9] - column sums
print(arr.sum(axis=1))  # [6 15] - row sums
print(arr.mean(axis=0)) # [2.5 3.5 4.5] - column means

Indexing and Slicing

Basic Indexing

arr = np.array([10, 20, 30, 40, 50])

print(arr[0])      # 10 - first element
print(arr[-1])     # 50 - last element
print(arr[1:4])    # [20 30 40] - slice
print(arr[::2])    # [10 30 50] - every other
print(arr[::-1])   # [50 40 30 20 10] - reverse

2D Indexing

arr_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

print(arr_2d[0, 0])      # 1 - element at (0,0)
print(arr_2d[1, 2])      # 6 - element at (1,2)
print(arr_2d[0])         # [1 2 3] - first row
print(arr_2d[:, 0])      # [1 4 7] - first column
print(arr_2d[0:2, 1:3])  # [[2 3] [5 6]] - subarray

Boolean Indexing

arr = np.array([1, 2, 3, 4, 5, 6])

# Create boolean mask
mask = arr > 3
print(mask)  # [False False False True True True]

# Filter array
print(arr[mask])  # [4 5 6]

# One-liner
print(arr[arr > 3])  # [4 5 6]

# Multiple conditions
print(arr[(arr > 2) & (arr < 5)])  # [3 4]

Fancy Indexing

arr = np.array([10, 20, 30, 40, 50])

# Index with array of indices
indices = [0, 2, 4]
print(arr[indices])  # [10 30 50]

# 2D fancy indexing
arr_2d = np.array([[1, 2], [3, 4], [5, 6]])
rows = [0, 1, 2]
cols = [1, 0, 1]
print(arr_2d[rows, cols])  # [2 3 6]

Array Manipulation

Reshaping

arr = np.arange(12)
print(arr)  # [0 1 2 3 4 5 6 7 8 9 10 11]

# Reshape to 2D
reshaped = arr.reshape(3, 4)
print(reshaped)
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]

# Reshape to 3D
reshaped_3d = arr.reshape(2, 3, 2)

# Flatten
flattened = reshaped.flatten()  # [0 1 2 ... 11]
ravel = reshaped.ravel()        # Same, but view

# Transpose
transposed = reshaped.T
print(transposed.shape)  # (4, 3)

Stacking and Splitting

a = np.array([1, 2, 3])
b = np.array([4, 5, 6])

# Vertical stack
vstacked = np.vstack([a, b])
# [[1 2 3]
#  [4 5 6]]

# Horizontal stack
hstacked = np.hstack([a, b])
# [1 2 3 4 5 6]

# Concatenate
concatenated = np.concatenate([a, b])

# Split
arr = np.arange(9)
split = np.split(arr, 3)  # [array([0,1,2]), array([3,4,5]), array([6,7,8])]

Broadcasting

Broadcasting: NumPy’s way of performing operations on arrays of different shapes

# Scalar broadcasting
arr = np.array([1, 2, 3])
print(arr + 10)  # [11 12 13]

# 1D to 2D broadcasting
arr_2d = np.array([[1, 2, 3], [4, 5, 6]])
arr_1d = np.array([10, 20, 30])
print(arr_2d + arr_1d)
# [[11 22 33]
#  [14 25 36]]

# Column vector broadcasting
col = np.array([[1], [2], [3]])
row = np.array([10, 20, 30])
print(col + row)
# [[11 21 31]
#  [12 22 32]
#  [13 23 33]]

Broadcasting Rules

  1. Arrays with fewer dimensions are padded with ones on the left
  2. Dimensions of size 1 are stretched to match the other array
  3. If dimensions don’t match and neither is 1, an error is raised
# Compatible shapes
(3, 4) + (4,)    →  (3, 4) + (1, 4)  →  (3, 4)
(3, 1) + (1, 4)  →  (3, 4)

# Incompatible shapes
(3, 4) + (3,)    →  Error! (can't broadcast)

Linear Algebra

# Matrix multiplication
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# Dot product
C = np.dot(A, B)  # or A @ B
print(C)
# [[19 22]
#  [43 50]]

# Determinant
det = np.linalg.det(A)  # -2.0

# Inverse
inv = np.linalg.inv(A)
print(inv)
# [[-2.   1. ]
#  [ 1.5 -0.5]]

# Eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(A)

# Solve linear system Ax = b
b = np.array([1, 2])
x = np.linalg.solve(A, b)

Statistics

data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

print(np.mean(data))       # 5.5 - mean
print(np.median(data))     # 5.5 - median
print(np.std(data))        # 2.87 - standard deviation
print(np.var(data))        # 8.25 - variance
print(np.percentile(data, 75))  # 7.75 - 75th percentile
print(np.corrcoef(data, data))  # Correlation coefficient

Practical Examples

Image Processing

# Load image as array (using PIL/Pillow)
from PIL import Image
img = Image.open('photo.jpg')
img_array = np.array(img)

print(img_array.shape)  # (height, width, 3) for RGB

# Grayscale conversion
gray = np.mean(img_array, axis=2)

# Crop image
cropped = img_array[100:400, 200:500]

# Flip image
flipped = np.flip(img_array, axis=0)

# Rotate 90 degrees
rotated = np.rot90(img_array)

Data Normalization

data = np.random.randn(1000, 5)  # Random data

# Min-Max normalization (each column scaled to 0..1)
# Per-column extrema (axis=0) keep this consistent with the column-wise
# z-score below; a single global min/max would leave most columns short of
# the full [0, 1] range.
col_min = data.min(axis=0)
col_max = data.max(axis=0)
normalized = (data - col_min) / (col_max - col_min)

# Z-score normalization (per column: mean=0, std=1)
standardized = (data - data.mean(axis=0)) / data.std(axis=0)

Moving Average

def moving_average(data, window_size):
    """Calculate the simple moving average of a 1-D sequence.

    Each output value is the unweighted mean of ``window_size`` consecutive
    input values. Only full windows are used, so the result has
    ``len(data) - window_size + 1`` elements.

    Parameters
    ----------
    data : array_like
        One-dimensional sequence of numbers.
    window_size : int
        Number of consecutive samples averaged per output value; must be >= 1.

    Returns
    -------
    numpy.ndarray
        The moving averages. Empty when ``data`` has fewer than
        ``window_size`` elements (no full window exists).

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    values = np.asarray(data)
    # np.convolve(mode='valid') swaps its operands when the kernel is longer
    # than the data and would return numbers that are not moving averages,
    # so report "no full window" explicitly instead.
    if window_size > values.size:
        return np.empty(0, dtype=float)

    kernel = np.ones(window_size) / window_size
    return np.convolve(values, kernel, mode='valid')

prices = np.array([100, 102, 98, 105, 110, 108, 112])
ma = moving_average(prices, window_size=3)
print(ma)  # [100. 101.67 104.33 107.67 110.]

Performance Tips

1. Vectorization

# ❌ Slow: Python loop
result = []
for x in range(1000000):
    result.append(x ** 2)

# ✅ Fast: Vectorized
result = np.arange(1000000) ** 2

2. Pre-allocate Arrays

# ❌ Slow: Growing array
arr = np.array([])
for i in range(1000):
    arr = np.append(arr, i)

# ✅ Fast: Pre-allocate
arr = np.zeros(1000)
for i in range(1000):
    arr[i] = i

3. Use Views Instead of Copies

arr = np.arange(1000000)

# View (fast, no copy)
view = arr[::2]

# Copy (slow, duplicates data)
copy = arr[::2].copy()

4. Use Appropriate Data Types

# ❌ Wasteful: float64 for small integers
arr = np.array([1, 2, 3], dtype=np.float64)  # 8 bytes each

# ✅ Efficient: int8 for small integers
arr = np.array([1, 2, 3], dtype=np.int8)     # 1 byte each

Common Pitfalls

1. Copy vs. View

arr = np.array([1, 2, 3, 4, 5])

# This is a view!
view = arr[1:4]
view[0] = 999
print(arr)  # [1 999 3 4 5] ← Original changed!

# Make a copy to avoid this
# Start from a fresh array: the view demo above already modified `arr`.
arr = np.array([1, 2, 3, 4, 5])
copy = arr[1:4].copy()
copy[0] = 999
print(arr)  # [1 2 3 4 5] ← Original unchanged

2. Integer Division

# Python 3 behavior
print(5 / 2)  # 2.5

# NumPy with integers
arr = np.array([5, 7, 9])
print(arr / 2)  # [2.5 3.5 4.5] ← Converts to float

# Integer division
print(arr // 2)  # [2 3 4]

3. Dimension Mismatch

# This works (broadcasting)
arr = np.array([[1, 2, 3], [4, 5, 6]])
print(arr + np.array([10, 20, 30]))  # OK

# This doesn't (incompatible shapes)
print(arr + np.array([10, 20]))  # Error!

Resources

Conclusion

NumPy is essential for:

  • ✅ Fast numerical computations
  • ✅ Scientific computing
  • ✅ Data analysis
  • ✅ Machine learning (foundation for pandas, scikit-learn, TensorFlow)

Key Takeaways:

  1. Use vectorized operations instead of loops
  2. Understand broadcasting for efficient code
  3. Be aware of views vs. copies
  4. Choose appropriate data types for memory efficiency

Master NumPy, and you’ll have a solid foundation for data science in Python!