ttnlsc · ttnlsc · Feb 10, 2024 · Feb 11, 2024 · Feb 11, 2024 · Feb 11, 2024
diff --git a/hw1/code/KNN.ipynb b/hw1/code/KNN.ipynb
diff --git a/hw1/code/knn.py b/hw1/code/knn.py
@@ -0,0 +1,156 @@
+import numpy as np
+
+
+class KNNClassifier:
+    """
+    K-neariest-neighbor classifier using L1 loss
+    """
+
+    def __init__(self, k=1):
+        self.k = k
+
+
+    def fit(self, X, y):
+        self.train_X = X
+        self.train_y = y
+
+
+    def predict(self, X, n_loops=0):
+        """
+        Uses the KNN model to predict clases for the data samples provided
+
+        Arguments:
+        X, np array (num_samples, num_features) - samples to run
+           through the model
+        num_loops, int - which implementation to use
+
+        Returns:
+        predictions, np array of ints (num_samples) - predicted class
+           for each sample
+        """
+
+        if n_loops == 0:
+            distances = self.compute_distances_no_loops(X)
+        elif n_loops == 1:
+            distances = self.compute_distances_one_loops(X)
+        else:
+            distances = self.compute_distances_two_loops(X)
+
+        if len(np.unique(self.train_y)) == 2:
+            return self.predict_labels_binary(distances)
+        else:
+            return self.predict_labels_multiclass(distances)
+
+
+    def compute_distances_two_loops(self, X):
+        """
+        Computes L1 distance from every sample of X to every training sample
+        Uses simplest implementation with 2 Python loops
+
+        Arguments:
+        X, np array (num_test_samples, num_features) - samples to run
+
+        Returns:
+        distances, np array (num_test_samples, num_train_samples) - array
+           with distances between each test and each train sample
+        """
+
+        num_test = X.shape[0]
+        num_train = self.train_X.shape[0]
+        distances = np.zeros((num_test, num_train))
+
+        for i in range(num_test):
+            for j in range(num_train):
+                distances[i, j] = np.sum(np.abs(X[i] - self.train_X[j]))
+
+        return distances
+
+
+    def compute_distances_one_loop(self, X):
+        """
+        Computes L1 distance from every sample of X to every training sample
+        Vectorizes some of the calculations, so only 1 loop is used
+
+        Arguments:
+        X, np array (num_test_samples, num_features) - samples to run
+
+        Returns:
+        distances, np array (num_test_samples, num_train_samples) - array
+           with distances between each test and each train sample
+        """
+
+        num_test = X.shape[0]
+        num_train = self.train_X.shape[0]
+        distances = np.zeros((num_test, num_train))
+
+        for i in range(num_test):
+            distances[i, :] = np.sum(np.abs(X[i] - self.train_X), axis=1)
+
+        return distances
+
+
+    def compute_distances_no_loops(self, X):
+        """
+        Computes L1 distance from every sample of X to every training sample
+        Fully vectorizes the calculations using numpy
+
+        Arguments:
+        X, np array (num_test_samples, num_features) - samples to run
+
+        Returns:
+        distances, np array (num_test_samples, num_train_samples) - array
+           with distances between each test and each train sample
+        """
+
+        X_test_expanded = np.expand_dims(X, axis=1)
+        X_train_expanded = np.expand_dims(self.train_X, axis=0)
+        distances = np.sum(np.abs(X_test_expanded - X_train_expanded), axis=2)
+
+        return distances
+
+
+
+    def predict_labels_binary(self, distances):
+        """
+        Returns model predictions for binary classification case
+
+        Arguments:
+        distances, np array (num_test_samples, num_train_samples) - array
+           with distances between each test and each train sample
+        Returns:
+        pred, np array of bool (num_test_samples) - binary predictions 
+           for every test sample
+        """
+
+        n_test = distances.shape[0]
+        prediction = np.zeros(n_test, dtype=bool)
+
+        for i in range(n_test):
+            nearest_indices = np.argsort(distances[i])[:self.k]
+            count_class_1 = np.sum(self.train_y[nearest_indices] == 1)
+            prediction[i] = count_class_1 > (self.k / 2)
+
+        return prediction
+
+
+    def predict_labels_multiclass(self, distances):
+        """
+        Returns model predictions for multi-class classification case
+
+        Arguments:
+        distances, np array (num_test_samples, num_train_samples) - array
+           with distances between each test and each train sample
+        Returns:
+        pred, np array of int (num_test_samples) - predicted class index 
+           for every test sample
+        """
+
+        n_test = distances.shape[0]
+        prediction = np.zeros(n_test, dtype=int)
+
+        for i in range(n_test):
+            nearest_indices = np.argsort(distances[i])[:self.k]
+            counts = np.bincount(self.train_y[nearest_indices])
+            prediction[i] = np.argmax(counts)
+
+        return prediction
diff --git a/hw1/code/metrics.py b/hw1/code/metrics.py
@@ -0,0 +1,99 @@
+import numpy as np
+
+
+def binary_classification_metrics(y_pred, y_true):
+    """
+    Computes metrics for binary classification
+    Arguments:
+    y_pred, np array (num_samples) - model predictions
+    y_true, np array (num_samples) - true labels
+    Returns:
+    precision, recall, f1, accuracy - classification metrics
+    """
+
+    labels = np.array([0, 1])
+    cm = np.zeros((2, 2), dtype=int)
+    for i in range(len(y_true)):
+        true_label = np.where(labels == y_true[i])[0][0]
+        pred_label = np.where(labels == y_pred[i])[0][0]
+        cm[true_label, pred_label] += 1
+    tp = cm[1, 1]
+    tn = cm[0, 0]
+    fp = cm[0, 1]
+    fn = cm[1, 0]
+    accuracy = (tp + tn) / cm.sum()
+    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
+    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
+    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0
+
+    return precision, recall, f1, accuracy
+
+
+def multiclass_accuracy(y_pred, y_true):
+    """
+    Computes metrics for multiclass classification
+    Arguments:
+    y_pred, np array of int (num_samples) - model predictions
+    y_true, np array of int (num_samples) - true labels
+    Returns:
+    accuracy - ratio of accurate predictions to total samples
+    """
+
+    labels = np.unique(y_true)
+    cm = np.zeros((len(labels), len(labels)), dtype=int)
+    for i in range(len(y_true)):
+        true_label = np.where(labels == y_true[i])[0][0]
+        pred_label = np.where(labels == y_pred[i])[0][0]
+        cm[true_label, pred_label] += 1
+
+    tp = np.diag(cm).sum()
+
+    accuracy = tp / cm.sum()
+
+    return accuracy
+
+
+def r_squared(y_pred, y_true):
+    """
+    Computes r-squared for regression
+    Arguments:
+    y_pred, np array of int (num_samples) - model predictions
+    y_true, np array of int (num_samples) - true values
+    Returns:
+    r2 - r-squared value
+    """
+
+    y_mean = np.mean(y_true)
+    ss_res = np.sum((y_true - y_pred) ** 2)
+    ss_total = np.sum((y_true - y_mean) ** 2)
+    return 1 - (ss_res / ss_total)
+
+
+def mse(y_pred, y_true):
+    """
+    Computes mean squared error
+    Arguments:
+    y_pred, np array of int (num_samples) - model predictions
+    y_true, np array of int (num_samples) - true values
+    Returns:
+    mse - mean squared error
+    """
+
+    n = len(y_pred)
+    ss_res = np.sum((y_true - y_pred) ** 2)
+    return ss_res / n
+
+
+def mae(y_pred, y_true):
+    """
+    Computes mean absolut error
+    Arguments:
+    y_pred, np array of int (num_samples) - model predictions
+    y_true, np array of int (num_samples) - true values
+    Returns:
+    mae - mean absolut error
+    """
+
+    n = len(y_pred)
+    s_res = np.sum(np.abs(y_true - y_pred))
+    return s_res / n
diff --git a/hw1/code/my_awesome_eda.py b/hw1/code/my_awesome_eda.py
@@ -0,0 +1,136 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+plt.rcParams.update({'font.weight': 'normal'})  # global matplotlib default set at import time; the plots below request bold text explicitly
+
+def run_eda(df: pd.DataFrame, category_values: int = 0) -> None:
+    '''
+    Perform Exploratory Data Analysis (EDA) on the given DataFrame.
+
+    Args:
+    - df (pd.DataFrame): The DataFrame to analyze.
+    - category_values (int): The threshold for unique values to consider a column as categorical.
+    Columns with unique values less than or equal to this threshold will be categorized.
+
+    Returns:
+    None
+    '''
+    # Greeting
+    print('Praise the Omnissiah! Welcome to the Sanctum of Exploratory Data Analysis.\n')
+
+    # Number of observations and parameters
+    num_observations = df.shape[0]
+    num_parameters = df.shape[1]
+    print(f'Number of Observations (Rows): {num_observations}')
+    print(f'Number of Parameters (Columns): {num_parameters}')
+
+    # Data types of each column
+    print('\nData Types of Each Column:')
+    for column_name in df.columns:
+        unique_values = df[column_name].nunique()
+        if unique_values <= category_values:
+            df[column_name] = df[column_name].astype('category')
+    column_types = df.dtypes
+    max_len = max(map(len, column_types.index)) + 2
+    for column_name, data_type in column_types.items():
+        print(f'{column_name.ljust(max_len)} {data_type}')
+
+    # Categorize features into numerical, string, and categorical
+    print()
+    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
+    string_features = df.select_dtypes(include=['object']).columns
+    categorical_features = df.select_dtypes(include=['category']).columns
+    print(f'Numerical features: {", ".join(numerical_features) if not numerical_features.empty else 0}')
+    print(f'String features: {", ".join(string_features) if not string_features.empty else 0}')
+    print(f'Categorical features: {", ".join(categorical_features) if not categorical_features.empty else 0}')
+
+    # Counts and frequencies for categorical features
+    print('\nCounts and Frequencies for Categorical Features:')
+    for col in categorical_features:
+        counts = df[col].value_counts()
+        frequencies = counts / num_observations
+        count_df = pd.DataFrame({'count': counts, 'Frequency': frequencies})
+        count_df.index.name = col
+        count_df = count_df.sort_index()
+        print(count_df.to_string())
+
+    # Descriptive statistics for numerical features except ID
+    print('\nDescriptive Statistics for Numerical Features:')
+    numerical_features_selected = numerical_features[~numerical_features.str.contains('Id')]
+    numerical_statistics_selected = df.loc[:, numerical_features_selected].describe().round(2)
+    print(numerical_statistics_selected)
+
+    print('\nHistograms with Boxplots for Numerical Features:')
+    numerical_features_selected = numerical_features[~numerical_features.str.contains('Id')]
+    for col in numerical_features_selected:
+        plt.figure(figsize=(6, 3))
+        sns.set(style="whitegrid")
+        plt.subplot(1, 2, 1)
+        sns.histplot(df[col], bins=30, kde=False, color='red')
+        plt.title(f'Histogram of {col}', fontsize=10, fontweight='bold')
+        plt.xlabel(col, fontsize=8, fontweight='bold')
+        plt.ylabel('Count', fontsize=8, fontweight='bold')
+        plt.xticks(fontsize=8)
+        plt.yticks(fontsize=8)
+        plt.grid(False)
+
+        plt.subplot(1, 2, 2)
+        sns.boxplot(x=df[col], color='red')
+        plt.title(f'Boxplot of {col}', fontsize=8, fontweight='bold')
+        plt.xlabel(col, fontsize=6, fontweight='bold')
+        plt.xticks(fontsize=6)
+        plt.yticks(fontsize=6)
+        plt.grid(False)
+        plt.show()
+
+    print('\nCorrelation Heatmap:')
+    plt.figure(figsize=(6, 4))
+    sns.set(font_scale=1)
+    sns.heatmap(df[numerical_features_selected].corr(), annot=True, cmap='inferno', fmt=".2f")
+    plt.title('Correlation Heatmap', fontsize=10, fontweight='bold')
+    plt.xticks(fontsize=8)
+    plt.yticks(fontsize=8)
+    plt.show()
+
+    # Outliers for numerical features except ID
+    print('\nOutliers for Numerical Features:')
+    outliers = {}
+    for col in numerical_features_selected:
+        q1 = df[col].quantile(0.25)
+        q3 = df[col].quantile(0.75)
+        iqr = q3 - q1
+        lower_bound = q1 - 1.5 * iqr
+        upper_bound = q3 + 1.5 * iqr
+        num_outliers = len(df[(df[col] < lower_bound) | (df[col] > upper_bound)])
+        outliers[col] = num_outliers
+    for col, num_outliers in outliers.items():
+        print(f'{col}: {num_outliers}')
+
+    # Missing values
+    print('\nMissing Values:')
+    total_missing = df.isnull().sum().sum()
+    rows_with_missing = df[df.isnull().any(axis=1)].shape[0]
+    columns_with_missing = df.columns[df.isnull().any()].tolist()
+    print(f'Total Missing Values: {total_missing}')
+    print(f'Rows with Missing Values: {rows_with_missing}')
+    print(f'Columns with Missing Values: {", ".join(columns_with_missing)}')
+
+    print('\nMissing Values Proportion:')
+    missing_proportion = df.isnull().mean()
+    plt.figure(figsize=(6, 3))
+    sns.set(style="whitegrid")
+    sns.barplot(x=missing_proportion.index, y=missing_proportion, color='red')
+    plt.title('Proportion of Missing Values for Each Variable', fontsize=10, fontweight='bold')
+    plt.xlabel('Variables', fontsize=8, fontweight='bold')
+    plt.ylabel('Proportion of Missing Values', fontsize=8, fontweight='bold')
+    plt.yticks(fontsize=8)
+    plt.xticks(rotation=45, ha='right', fontsize=8)    
+    plt.grid(False)
+    plt.show()
+
+    # Duplicate rows
+    print('\nDuplicate Rows:')
+    num_duplicates = df.duplicated().sum()
+    print(f'Number of Duplicate Rows: {num_duplicates}')
+    print('\nMay the data guide you, and the Omnissiah bless your analysis.')
diff --git a/hw1/data_folder/cat.jpeg b/hw1/data_folder/cat.jpeg
diff --git a/hw1/data_folder/photo_2024-02-11_11-03-30.jpg b/hw1/data_folder/photo_2024-02-11_11-03-30.jpg
diff --git a/hw1/data_folder/photo_2024-02-11_11-04-15.jpg b/hw1/data_folder/photo_2024-02-11_11-04-15.jpg