Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,207 changes: 2,207 additions & 0 deletions hw1/code/KNN.ipynb

Large diffs are not rendered by default.

156 changes: 156 additions & 0 deletions hw1/code/knn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import numpy as np


class KNNClassifier:
    """
    K-nearest-neighbor classifier using L1 (Manhattan) distance

    The classifier memorizes the training set in fit() and, at prediction
    time, votes among the k training samples closest to each test sample.
    """

    def __init__(self, k=1):
        """
        Arguments:
        k, int - number of nearest neighbors that take part in the vote
        """
        self.k = k

    def fit(self, X, y):
        """
        Stores the training data; no computation happens here

        Arguments:
        X, np array (num_train_samples, num_features) - training samples
        y, np array (num_train_samples) - training labels
        """
        self.train_X = X
        self.train_y = y

    def predict(self, X, n_loops=0):
        """
        Uses the KNN model to predict classes for the data samples provided

        Arguments:
        X, np array (num_samples, num_features) - samples to run
           through the model
        n_loops, int - which distance implementation to use:
           0 - fully vectorized, 1 - one Python loop,
           anything else - two Python loops

        Returns:
        predictions, np array (num_samples) - predicted class for each
           sample (bool for two training classes, int otherwise)
        """
        if n_loops == 0:
            distances = self.compute_distances_no_loops(X)
        elif n_loops == 1:
            # The method is named compute_distances_one_loop (singular).
            distances = self.compute_distances_one_loop(X)
        else:
            distances = self.compute_distances_two_loops(X)

        # Exactly two distinct training labels selects the binary voter.
        if len(np.unique(self.train_y)) == 2:
            return self.predict_labels_binary(distances)
        return self.predict_labels_multiclass(distances)

    def compute_distances_two_loops(self, X):
        """
        Computes L1 distance from every sample of X to every training sample
        Uses simplest implementation with 2 Python loops

        Arguments:
        X, np array (num_test_samples, num_features) - samples to run

        Returns:
        distances, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample
        """
        num_test = X.shape[0]
        num_train = self.train_X.shape[0]
        distances = np.zeros((num_test, num_train))

        for i in range(num_test):
            for j in range(num_train):
                distances[i, j] = np.sum(np.abs(X[i] - self.train_X[j]))

        return distances

    def compute_distances_one_loop(self, X):
        """
        Computes L1 distance from every sample of X to every training sample
        Vectorizes some of the calculations, so only 1 loop is used

        Arguments:
        X, np array (num_test_samples, num_features) - samples to run

        Returns:
        distances, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample
        """
        num_test = X.shape[0]
        num_train = self.train_X.shape[0]
        distances = np.zeros((num_test, num_train))

        for i in range(num_test):
            # X[i] broadcasts against every training row at once.
            distances[i, :] = np.sum(np.abs(X[i] - self.train_X), axis=1)

        return distances

    def compute_distances_no_loops(self, X):
        """
        Computes L1 distance from every sample of X to every training sample
        Fully vectorizes the calculations using numpy

        Note: broadcasting builds an intermediate array of shape
        (num_test_samples, num_train_samples, num_features).

        Arguments:
        X, np array (num_test_samples, num_features) - samples to run

        Returns:
        distances, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample
        """
        X_test_expanded = np.expand_dims(X, axis=1)
        X_train_expanded = np.expand_dims(self.train_X, axis=0)
        return np.sum(np.abs(X_test_expanded - X_train_expanded), axis=2)

    def predict_labels_binary(self, distances):
        """
        Returns model predictions for binary classification case

        A sample is predicted positive when strictly more than half of its
        k nearest training samples carry the label 1.

        Arguments:
        distances, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample

        Returns:
        pred, np array of bool (num_test_samples) - binary predictions
           for every test sample
        """
        n_test = distances.shape[0]
        prediction = np.zeros(n_test, dtype=bool)

        for i in range(n_test):
            nearest_indices = np.argsort(distances[i])[:self.k]
            count_class_1 = np.sum(self.train_y[nearest_indices] == 1)
            prediction[i] = count_class_1 > (self.k / 2)

        return prediction

    def predict_labels_multiclass(self, distances):
        """
        Returns model predictions for multi-class classification case

        The predicted class is the most frequent label among the k nearest
        training samples; np.argmax resolves ties in favor of the smallest
        label. Labels must be non-negative integers (np.bincount).

        Arguments:
        distances, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample

        Returns:
        pred, np array of int (num_test_samples) - predicted class index
           for every test sample
        """
        n_test = distances.shape[0]
        prediction = np.zeros(n_test, dtype=int)

        for i in range(n_test):
            nearest_indices = np.argsort(distances[i])[:self.k]
            counts = np.bincount(self.train_y[nearest_indices])
            prediction[i] = np.argmax(counts)

        return prediction
99 changes: 99 additions & 0 deletions hw1/code/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import numpy as np


def binary_classification_metrics(y_pred, y_true):
    """
    Computes metrics for binary classification

    The positive class is 1 (or True) and the negative class is 0 (or False).

    Arguments:
    y_pred, array-like (num_samples) - model predictions (0/1 or bool)
    y_true, array-like (num_samples) - true labels (0/1 or bool)

    Returns:
    precision, recall, f1, accuracy - classification metrics; a metric whose
       denominator is zero (all four for empty input) is reported as 0

    Raises:
    ValueError - if the inputs differ in shape or hold labels other than 0/1
    """
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    if y_pred.shape != y_true.shape:
        raise ValueError(
            f'y_pred and y_true must have the same shape, '
            f'got {y_pred.shape} and {y_true.shape}'
        )

    pred_pos = y_pred == 1
    true_pos = y_true == 1
    # Anything that is neither 0 nor 1 cannot be placed in the 2x2 table.
    if not ((pred_pos | (y_pred == 0)).all() and (true_pos | (y_true == 0)).all()):
        raise ValueError('binary metrics expect labels 0/1 (or bool)')

    # Confusion-matrix cells counted with boolean masks instead of a loop.
    tp = np.sum(pred_pos & true_pos)
    tn = np.sum(~pred_pos & ~true_pos)
    fp = np.sum(pred_pos & ~true_pos)
    fn = np.sum(~pred_pos & true_pos)
    total = tp + tn + fp + fn

    # Every ratio falls back to 0 when its denominator is empty.
    accuracy = (tp + tn) / total if total != 0 else 0.0
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0.0
    if (precision + recall) != 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return precision, recall, f1, accuracy


def multiclass_accuracy(y_pred, y_true):
    """
    Computes accuracy for multiclass classification

    Predictions may contain classes that never occur in y_true; they simply
    count as errors.

    Arguments:
    y_pred, array-like of int (num_samples) - model predictions
    y_true, array-like of int (num_samples) - true labels

    Returns:
    accuracy - ratio of accurate predictions to total samples
       (0 for empty input)

    Raises:
    ValueError - if the inputs differ in shape
    """
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    if y_pred.shape != y_true.shape:
        raise ValueError(
            f'y_pred and y_true must have the same shape, '
            f'got {y_pred.shape} and {y_true.shape}'
        )
    if y_true.size == 0:
        return 0.0

    # Accuracy is the share of matching positions; no confusion matrix needed.
    return np.mean(y_pred == y_true)


def r_squared(y_pred, y_true):
    """
    Computes r-squared for regression

    Arguments:
    y_pred, array-like (num_samples) - model predictions
    y_true, array-like (num_samples) - true values

    Returns:
    r2 - r-squared value. When y_true is constant the ratio is undefined;
       in that case 1.0 is returned for a perfect fit and 0.0 otherwise.
       Empty input yields NaN.
    """
    # Cast to float so fixed-width integer inputs cannot wrap around on
    # subtraction and plain lists are accepted.
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)
    if y_true.size == 0:
        return float('nan')

    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_total == 0:
        # Constant target: avoid dividing by zero.
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_total)


def mse(y_pred, y_true):
    """
    Computes mean squared error

    Arguments:
    y_pred, array-like (num_samples) - model predictions
    y_true, array-like (num_samples) - true values

    Returns:
    mse - mean squared error (NaN for empty input)
    """
    # Cast to float so fixed-width integer inputs cannot wrap around on
    # subtraction and plain lists are accepted.
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)
    n = len(y_pred)
    if n == 0:
        return float('nan')

    ss_res = np.sum((y_true - y_pred) ** 2)
    return ss_res / n


def mae(y_pred, y_true):
    """
    Computes mean absolute error

    Arguments:
    y_pred, array-like (num_samples) - model predictions
    y_true, array-like (num_samples) - true values

    Returns:
    mae - mean absolute error (NaN for empty input)
    """
    # Cast to float so fixed-width integer inputs cannot wrap around on
    # subtraction and plain lists are accepted.
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)
    n = len(y_pred)
    if n == 0:
        return float('nan')

    s_res = np.sum(np.abs(y_true - y_pred))
    return s_res / n
136 changes: 136 additions & 0 deletions hw1/code/my_awesome_eda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams.update({'font.weight': 'normal'})

def run_eda(df: pd.DataFrame, category_values: int = 0) -> None:
    '''
    Perform Exploratory Data Analysis (EDA) on the given DataFrame.

    Prints, in order: the shape, column dtypes, the feature split
    (numerical / string / categorical), counts and frequencies of categorical
    features, descriptive statistics, IQR-based outlier counts, missing-value
    and duplicate-row summaries. Shows histograms with boxplots, a correlation
    heatmap and a missing-value bar plot via matplotlib/seaborn.

    The input DataFrame is not modified: the categorical cast is applied to
    an internal copy.

    Args:
    - df (pd.DataFrame): The DataFrame to analyze.
    - category_values (int): The threshold for unique values to consider a column as categorical.
        Columns with unique values less than or equal to this threshold will be categorized.

    Returns:
        None
    '''
    # Work on a copy so the categorical cast below does not leak into the
    # caller's DataFrame.
    df = df.copy()

    # Greeting
    print('Praise the Omnissiah! Welcome to the Sanctum of Exploratory Data Analysis.\n')

    # Number of observations and parameters
    num_observations, num_parameters = df.shape
    print(f'Number of Observations (Rows): {num_observations}')
    print(f'Number of Parameters (Columns): {num_parameters}')

    # Data types of each column
    print('\nData Types of Each Column:')
    for column_name in df.columns:
        if df[column_name].nunique() <= category_values:
            df[column_name] = df[column_name].astype('category')
    column_types = df.dtypes
    # str() keeps this working for non-string column labels; default=0 covers
    # a frame without columns.
    max_len = max((len(str(name)) for name in column_types.index), default=0) + 2
    for column_name, data_type in column_types.items():
        print(f'{str(column_name).ljust(max_len)} {data_type}')

    # Categorize features into numerical, string, and categorical
    print()
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
    string_features = df.select_dtypes(include=['object']).columns
    categorical_features = df.select_dtypes(include=['category']).columns
    print(f'Numerical features: {", ".join(map(str, numerical_features)) if not numerical_features.empty else 0}')
    print(f'String features: {", ".join(map(str, string_features)) if not string_features.empty else 0}')
    print(f'Categorical features: {", ".join(map(str, categorical_features)) if not categorical_features.empty else 0}')

    # Counts and frequencies for categorical features
    print('\nCounts and Frequencies for Categorical Features:')
    for col in categorical_features:
        counts = df[col].value_counts()
        frequencies = counts / num_observations
        count_df = pd.DataFrame({'count': counts, 'Frequency': frequencies})
        count_df.index.name = col
        count_df = count_df.sort_index()
        print(count_df.to_string())

    # Numerical features except ID-like columns (name contains 'Id');
    # computed once and reused by every section below.
    numerical_features_selected = [
        col for col in numerical_features if 'Id' not in str(col)
    ]

    # Descriptive statistics for numerical features except ID
    print('\nDescriptive Statistics for Numerical Features:')
    if numerical_features_selected:
        numerical_statistics_selected = df.loc[:, numerical_features_selected].describe().round(2)
        print(numerical_statistics_selected)
    else:
        # describe() raises on a frame without columns.
        print('No numerical features to describe.')

    print('\nHistograms with Boxplots for Numerical Features:')
    for col in numerical_features_selected:
        plt.figure(figsize=(6, 3))
        sns.set(style="whitegrid")
        plt.subplot(1, 2, 1)
        sns.histplot(df[col], bins=30, kde=False, color='red')
        plt.title(f'Histogram of {col}', fontsize=10, fontweight='bold')
        plt.xlabel(col, fontsize=8, fontweight='bold')
        plt.ylabel('Count', fontsize=8, fontweight='bold')
        plt.xticks(fontsize=8)
        plt.yticks(fontsize=8)
        plt.grid(False)

        plt.subplot(1, 2, 2)
        sns.boxplot(x=df[col], color='red')
        plt.title(f'Boxplot of {col}', fontsize=8, fontweight='bold')
        plt.xlabel(col, fontsize=6, fontweight='bold')
        plt.xticks(fontsize=6)
        plt.yticks(fontsize=6)
        plt.grid(False)
        plt.show()

    print('\nCorrelation Heatmap:')
    if numerical_features_selected:
        plt.figure(figsize=(6, 4))
        sns.set(font_scale=1)
        sns.heatmap(df[numerical_features_selected].corr(), annot=True, cmap='inferno', fmt=".2f")
        plt.title('Correlation Heatmap', fontsize=10, fontweight='bold')
        plt.xticks(fontsize=8)
        plt.yticks(fontsize=8)
        plt.show()
    else:
        # An empty correlation matrix cannot be drawn.
        print('No numerical features to correlate.')

    # Outliers for numerical features except ID (1.5 * IQR rule)
    print('\nOutliers for Numerical Features:')
    for col in numerical_features_selected:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outlier_mask = (df[col] < lower_bound) | (df[col] > upper_bound)
        print(f'{col}: {int(outlier_mask.sum())}')

    # Missing values
    print('\nMissing Values:')
    missing_mask = df.isnull()
    total_missing = missing_mask.sum().sum()
    rows_with_missing = int(missing_mask.any(axis=1).sum())
    columns_with_missing = df.columns[missing_mask.any()].tolist()
    print(f'Total Missing Values: {total_missing}')
    print(f'Rows with Missing Values: {rows_with_missing}')
    print(f'Columns with Missing Values: {", ".join(map(str, columns_with_missing))}')

    print('\nMissing Values Proportion:')
    missing_proportion = missing_mask.mean()
    plt.figure(figsize=(6, 3))
    sns.set(style="whitegrid")
    sns.barplot(x=missing_proportion.index, y=missing_proportion, color='red')
    plt.title('Proportion of Missing Values for Each Variable', fontsize=10, fontweight='bold')
    plt.xlabel('Variables', fontsize=8, fontweight='bold')
    plt.ylabel('Proportion of Missing Values', fontsize=8, fontweight='bold')
    plt.yticks(fontsize=8)
    plt.xticks(rotation=45, ha='right', fontsize=8)
    plt.grid(False)
    plt.show()

    # Duplicate rows
    print('\nDuplicate Rows:')
    num_duplicates = df.duplicated().sum()
    print(f'Number of Duplicate Rows: {num_duplicates}')
    print('\nMay the data guide you, and the Omnissiah bless your analysis.')
Binary file added hw1/data_folder/cat.jpeg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added hw1/data_folder/photo_2024-02-11_11-03-30.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added hw1/data_folder/photo_2024-02-11_11-04-15.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.