들어가며
딥러닝 코드를 보다 보면 NumPy와 Pandas가 항상 등장한다.
파이썬은 쓸 수 있지만 이 두 라이브러리는 딥러닝에 특화된 방식으로 쓰이기 때문에 따로 정리해둘 필요가 있다.
특히 데이터 전처리 단계는 NumPy와 Pandas 없이 진행하기가 거의 불가능하다.
NumPy 핵심
배열 생성
import numpy as np
# Build a 1-D vector and a 2x3 matrix from plain Python lists.
a = np.array([1, 2, 3, 4, 5])
b = np.array([
    [1, 2, 3],
    [4, 5, 6],
])

# One value per output line: (5,), then (2, 3), then 2.
print(a.shape, b.shape, b.ndim, sep="\n")

# Constant-filled and identity matrices.
zeros = np.zeros(shape=(3, 4))
ones = np.ones(shape=(3, 4))
eye = np.eye(N=4)

# 3x4 samples drawn from the standard normal distribution.
rand = np.random.randn(3, 4)

# arange: start, stop (exclusive), step -> [0 2 4 6 8]
arange = np.arange(0, 10, 2)
# linspace: 5 evenly spaced points including both endpoints -> [0. 0.25 0.5 0.75 1.]
linspace = np.linspace(start=0, stop=1, num=5)
인덱싱과 슬라이싱
# 3x3 matrix used by every indexing example below.
a = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])

# Each expression is printed on its own line(s), in this order.
print(
    a[0],        # first row                         -> [1 2 3]
    a[0, 1],     # row 0, column 1                   -> 2
    a[:2],       # first two rows
    a[:, 1],     # second column                     -> [2 5 8]
    a[a > 5],    # boolean mask, result is 1-D       -> [6 7 8 9]
    a[[0, 2]],   # integer-list indexing: rows 0 and 2
    sep="\n",
)
Shape 변환
# 24 consecutive integers, viewed under several shapes.
a = np.arange(24)
b = a.reshape((4, 6))
c = a.reshape((2, 3, 4))
d = a.reshape((4, -1))     # -1: the remaining axis is inferred (6 here)
flat = b.flatten()         # back to one dimension, 24 elements

# Adding a length-1 axis to a vector.
x = np.array([1, 2, 3])
x1 = x[np.newaxis, :]      # (3,) -> (1, 3)
x2 = x[:, np.newaxis]      # (3,) -> (3, 1)

# Prepend a length-1 axis: (3, 224, 224) -> (1, 3, 224, 224).
image = np.random.randn(3, 224, 224)
batch = np.expand_dims(image, axis=0)
수학 연산
# Element-wise arithmetic on two equal-length vectors.
a = np.array([1, 2, 3, 4])
b = np.array([10, 20, 30, 40])
print(
    a + b,         # [11 22 33 44]
    a * b,         # [ 10  40  90 160]
    a ** 2,        # [ 1  4  9 16]
    np.sqrt(a),    # element-wise square root
    sep="\n",
)

# Matrix product: (3, 4) times (4, 5) gives (3, 5).
A = np.random.randn(3, 4)
B = np.random.randn(4, 5)
C = np.matmul(A, B)

# Reductions over the whole array and along each axis.
x = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
print(
    x.sum(),            # all elements           -> 21
    x.sum(axis=0),      # down the rows          -> [5 7 9]
    x.sum(axis=1),      # across the columns     -> [ 6 15]
    x.argmax(axis=1),   # index of each row's max -> [2 2]
    sep="\n",
)
브로드캐스팅
# Column-wise standardisation of a (32, 10) array.
X = np.random.randn(32, 10)
mean = np.mean(X, axis=0)    # shape (10,): one mean per column
std = np.std(X, axis=0)      # shape (10,): one std per column

# (32, 10) - (10,) broadcasts the per-column statistics across all rows.
# The small constant keeps the division finite if a column's std is 0.
X_norm = (X - mean) / (std + 1e-8)

# After normalisation the per-column mean is ~0 and the std is ~1.
print(
    X_norm.mean(axis=0).round(4),
    X_norm.std(axis=0).round(4),
    sep="\n",
)
유용한 함수들
# Conditional selection and clamping.
a = np.array([-2, -1, 0, 1, 2])
print(
    np.where(a > 0, a, 0),   # keep positives, replace the rest with 0 -> [0 0 0 1 2]
    np.clip(a, -1, 1),       # clamp into [-1, 1]                      -> [-1 -1  0  1  1]
    sep="\n",
)

# Joining arrays: stack adds a new axis, concatenate extends an existing one.
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])
print(
    np.stack((x, y)),         # shape (2, 3)
    np.concatenate((x, y)),   # shape (6,) -> [1 2 3 4 5 6]
    sep="\n",
)

# Sorted values, and the indices that would sort the array.
a = np.array([3, 1, 4, 1, 5])
print(
    np.sort(a),       # [1 1 3 4 5]
    np.argsort(a),
    sep="\n",
)
Pandas 핵심
DataFrame 생성
import pandas as pd
# Four-row sample table built from a dict of equal-length columns.
df = pd.DataFrame(
    {
        'name': ['Alice', 'Bob', 'Charlie', 'David'],
        'age': [25, 30, 35, 28],
        'score': [85.5, 92.0, 78.3, 88.1],
        'passed': [True, True, False, True],
    }
)

# Printed in order: (rows, columns), per-column dtypes, summary statistics.
print(df.shape, df.dtypes, df.describe(), sep="\n")
# Read a CSV file into a DataFrame, then write the frame back out.
# index=False keeps the row index from being written as an extra column.
# NOTE(review): this rebinds `df`, replacing the sample frame built above; the
# exploration examples that follow select columns named name/age/score/passed,
# so confirm data.csv provides them (or load it into a separate variable).
df = pd.read_csv('data.csv')
df.to_csv('output.csv', index=False)
데이터 탐색
# NOTE(review): assumes `df` has columns name / age / score / passed - confirm
# against whatever was loaded into `df` before this point.

# First and last three rows.
print(df.head(3))
print(df.tail(3))

# Column selection: one column, then a list of columns.
print(df['name'])
print(df[['name', 'score']])

# Row selection: by index label (loc) and by integer position (iloc).
print(df.loc[0])
print(df.iloc[0:2, 0:2])

# Boolean filtering; combine conditions with & and parenthesise each one.
is_over_28 = df['age'] > 28
print(df[is_over_28])

is_over_25_and_passed = (df['age'] > 25) & (df['passed'] == True)
print(df[is_over_25_and_passed])

is_alice_or_bob = df['name'].isin(['Alice', 'Bob'])
print(df[is_alice_or_bob])
결측값 처리
# Two numeric columns with gaps: A has one NaN, B has two.
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [np.nan, 2, 3, np.nan],
})

# Number of missing values per column.
print(df.isnull().sum())

# Replace each NaN with its own column's mean (the mean skips NaN).
df_mean = df.fillna(df.mean())
# Replace every NaN with a constant.
df_zero = df.fillna(0)
# Drop every row that contains at least one NaN (only row 1 survives here).
df_drop = df.dropna()
# Forward fill: copy the previous row's value downward. A NaN in the first row
# stays NaN because there is nothing above it.
# `fillna(method='ffill')` is deprecated and removed in newer pandas releases;
# `ffill()` is the supported spelling.
df_ffill = df.ffill()
데이터 변환
df = pd.DataFrame(
    {
        'name': ['Alice', 'Bob', 'Charlie'],
        'score': [85.5, 92.0, 78.3],
        'grade': ['A', 'A', 'B'],
    }
)

# Standardise the score column: subtract its mean, divide by its std.
# Series.std() uses the sample standard deviation (ddof=1) by default.
scores = df['score']
df['score_norm'] = (scores - scores.mean()) / scores.std()

# Apply a function to every element: length of each name.
df['name_len'] = df['name'].map(len)

# Element-wise rule: 85 or above is 'high', otherwise 'low'.
df['score_level'] = df['score'].apply(lambda s: 'low' if s < 85 else 'high')

# Dictionary lookup: letter grade -> grade points.
grade_map = {'A': 4.0, 'B': 3.0, 'C': 2.0}
df['gpa'] = df['grade'].map(grade_map)
그룹화와 집계
df = pd.DataFrame(
    {
        'department': ['개발', '개발', '디자인', '디자인'],
        'salary': [5000, 6000, 4500, 5500],
        'score': [85, 92, 78, 88],
    }
)

# Group the rows by department once and reuse the grouping.
by_department = df.groupby('department')

# Mean salary per department.
print(by_department['salary'].mean())

# Several aggregations at once: a list gives multiple statistics for a column.
summary_spec = {
    'salary': ['mean', 'sum'],
    'score': 'mean',
}
print(by_department.agg(summary_spec))
딥러닝 전처리 실전
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import torch
# 1) Load the table and remove the four columns that are not used as features.
df = pd.read_csv('titanic.csv')
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# 2) Fill missing values: median for Age, most frequent value for Embarked.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# 3) Encode categoricals: integer codes for Sex, one-hot columns for Embarked
#    (drop_first=True omits one of the resulting indicator columns).
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

# 4) Separate the target column from the feature matrix.
y = df['Survived'].values
X = df.drop(columns='Survived').values

# 5) 80/20 split, stratified on y, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 6) Fit the scaler on the training split only, then reuse those statistics
#    for the validation split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# 7) Convert to tensors: float32 features, int64 (long) labels.
X_train_t = torch.tensor(X_train, dtype=torch.float32)
X_val_t = torch.tensor(X_val, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)
y_val_t = torch.tensor(y_val, dtype=torch.long)

print(f"학습: {X_train_t.shape}, 검증: {X_val_t.shape}")
NumPy ↔ Pandas ↔ PyTorch 변환
import numpy as np
import pandas as pd
import torch
# NumPy -> Pandas -> NumPy
arr = np.random.randn(5, 3)
df = pd.DataFrame(arr, columns=list('ABC'))
arr_back = df.to_numpy()

# NumPy -> PyTorch -> NumPy
# from_numpy keeps the array's dtype (float64 here) and shares its memory;
# cast with .float() before feeding a float32 model.
tensor = torch.from_numpy(arr)
# detach() drops any autograd tracking before converting back to an ndarray.
arr_from = tensor.detach().numpy()
실무에서 자주 쓰는 패턴
def eda(df):
    """Print a quick exploratory summary of a DataFrame.

    Prints, in order: the shape, the number of missing values per column,
    the ``describe()`` statistics, and a value-count dictionary for every
    ``object``-dtype column.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. All output is written to stdout.
    """
    print(f"Shape: {df.shape}")
    print(f"\n결측값:\n{df.isnull().sum()}")
    if df.shape[1] == 0:
        # describe() raises ValueError on a frame without columns, so print a
        # placeholder instead of crashing on empty input.
        print("\n수치형 통계:\n(컬럼 없음)")
    else:
        print(f"\n수치형 통계:\n{df.describe()}")
    for col in df.select_dtypes(include='object').columns:
        # value_counts() orders categories from most to least frequent.
        print(f"\n{col}: {df[col].value_counts().to_dict()}")
def check_balance(y):
    """Print how many samples each class has and its share of the total.

    Args:
        y: array-like of class labels. ``np.unique`` flattens its input, so
            every element is counted regardless of the array's shape.

    Returns:
        None. One line per distinct label is written to stdout.
    """
    classes, counts = np.unique(y, return_counts=True)
    # Use the number of counted elements as the denominator. len(y) is only
    # the first dimension, which gives wrong percentages for multi-dimensional y.
    total = counts.sum()
    for cls, cnt in zip(classes, counts):
        print(f"클래스 {cls}: {cnt}개 ({100*cnt/total:.1f}%)")
def create_batches(X, y, batch_size=32, shuffle=True):
    """Yield ``(X_batch, y_batch)`` pairs covering the data exactly once.

    Args:
        X: indexable container of samples that supports indexing with an
            integer index array (e.g. a NumPy array).
        y: labels aligned with ``X``; must have the same length.
        batch_size: maximum number of samples per batch. The last batch is
            smaller when ``len(X)`` is not a multiple of ``batch_size``.
        shuffle: when True, visit the samples in a random order drawn from
            NumPy's global random state; when False, keep the original order.

    Yields:
        Tuples ``(X[idx], y[idx])`` where ``idx`` holds the indices of one batch.

    Raises:
        ValueError: if ``batch_size`` is not positive or ``X`` and ``y`` differ
            in length. Because this is a generator, the check runs when the
            first batch is requested.
    """
    if batch_size <= 0:
        # A negative step would make range() empty and silently yield nothing.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    n = len(X)
    if len(y) != n:
        raise ValueError(f"X and y must have the same length, got {n} and {len(y)}")
    indices = np.random.permutation(n) if shuffle else np.arange(n)
    for start in range(0, n, batch_size):
        # Slicing past the end is safe; the final batch just comes out shorter.
        idx = indices[start:start + batch_size]
        yield X[idx], y[idx]
정리
| 항목 | NumPy | Pandas |
| --- | --- | --- |
| 주요 자료구조 | ndarray | DataFrame, Series |
| 사용 목적 | 수치 연산, 행렬 계산 | 테이블 데이터 처리 |
| 인덱싱 | 위치/불리언 기반 | loc(레이블), iloc(위치) |
| 결측값 | nan | isnull(), fillna() |
| 딥러닝 활용 | 텐서 변환 직전 | 전처리, EDA |