diff --git a/run_ranker_dataset.py b/run_ranker_dataset.py new file mode 100644 index 0000000..22e8dfb --- /dev/null +++ b/run_ranker_dataset.py @@ -0,0 +1,444 @@ +import pickle +from pathlib import Path +from typing import Literal, Optional + +import numpy as np +import pandas as pd +import scipy.sparse as sps +from loguru import logger +from scipy.stats import kurtosis, skew +from sklearn.model_selection import KFold, ShuffleSplit +from tqdm import tqdm, trange + +from Data_manager.competition import load, load_raw +from Recommenders.BaseRecommender import BaseRecommender +from Recommenders.Hybrid import ( + ScoresMultipleHybridRecommender, + UserWideHybridRecommender, +) +from Recommenders.Similarity.Compute_Similarity import Compute_Similarity +from Recommenders.MatrixFactorization.PureSVDRecommender import PureSVDRecommender + +logger.remove() +logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True) + +USE = "training" +EXPERIMENT = "final" +NUMBER_FOLDS = 10 +CUTOFF = 50 + +TOP_POPULAR_THRESHOLDS = (10, 100, 1000) + +ITEM_LATENT_DIMENSIONS = 10 +USER_LATENT_DIMENSIONS = 10 + +RECOMMENDATION_MODELS_TO_USE = ( + 70, + 71, + 72, + 73, +) +SCORE_MODELS_TO_USE = ( + 20, + 21, + 22, + 23, +) +USE_SCORE_HYBRID = True +USE_USER_HYBRID = True +USER_WIDE_HYBRID_BEGIN = 30 + +OUTPUT_PATH = Path() / f"ranker_{USE}_data_{EXPERIMENT}.parquet" + +MODELS_BASE_DIR = Path() / "models" +TRAIN_MODELS_BASE_DIR = MODELS_BASE_DIR / "train" +TRAIN_MODELS_MAP_DIR = TRAIN_MODELS_BASE_DIR / "map" +TRAIN_MODELS_RECALL_DIR = TRAIN_MODELS_BASE_DIR / "recall" +SUBMISSION_MODELS_MAP_DIR = MODELS_BASE_DIR / "all" / "map" / "renamed" +SUBMISSION_MODELS_RECALL_DIR = Path( + str(SUBMISSION_MODELS_MAP_DIR).replace("map", "recall") +) + +NUMBER_GROUPS_USER_WIDE_HYBRID = 10 + +MULTIPLE_SCORE_HYBRID_WEIGHTS = { + 50: 0.253770701546336, + 51: 0.10324855050317669, +} + +SVD_FIT_PARAMS = { + "num_factors": 350, +} + +ROWS_PER_FOLD = 900_000 + +RNG = np.random.default_rng(42) + + +def 
def compute_base_dataset(
    number_users: int,
    recommendation_models: dict[str, BaseRecommender],
    score_models: dict[str, BaseRecommender],
    cutoff: int,
    fold: Optional[int] = None,
) -> pd.DataFrame:
    """Build the candidate table: one row per (user, recommender, item).

    For every user id in ``range(number_users)`` each model in
    ``recommendation_models`` is asked for up to ``cutoff`` unseen items.
    The resulting rows carry the proposing model (``Recommender``), the
    position of the item in that model's list (``Ranking``) and the number
    of rows sharing the same (user, item) pair (``recommender_agreement``).
    Each model in ``score_models`` then contributes a ``score_<label>``
    column, and ``row_statistics`` appends summary statistics computed over
    those score columns.

    Args:
        number_users: Number of users; user ids ``0..number_users-1`` are
            iterated.
        recommendation_models: Candidate generators, keyed by label.
        score_models: Models whose ``_compute_item_score`` is used to score
            each candidate, keyed by label.
        cutoff: Maximum number of candidates requested from each
            recommendation model.
        fold: When not ``None``, stored in a constant ``fold`` column.

    Returns:
        The candidate table with ``UserID`` as a regular column.
    """
    dataset = pd.DataFrame(index=range(0, number_users), columns=["ItemID"])
    dataset.index.name = "UserID"

    recommendations_list = []
    recommenders_list = []
    rank_list = []
    for user_id in trange(number_users, desc="User (candidate)"):
        user_recommendations = []
        user_recommenders = []
        user_rankings = []
        for name, recommender in recommendation_models.items():
            recommended_items = recommender.recommend(
                user_id,
                cutoff=cutoff,
                remove_seen_flag=True,
            )
            # Size the labels from what was actually returned: if a model
            # yields fewer than `cutoff` items, labelling with `cutoff`
            # entries would misalign the three lists once exploded.
            number_recommended = len(recommended_items)
            user_recommendations.extend(recommended_items)
            user_recommenders.extend([name] * number_recommended)
            user_rankings.extend(range(number_recommended))
        recommendations_list.append(user_recommendations)
        recommenders_list.append(user_recommenders)
        rank_list.append(user_rankings)

    dataset["ItemID"] = recommendations_list
    dataset["Recommender"] = recommenders_list
    dataset["Ranking"] = rank_list

    # The three list columns have equal per-user lengths, so exploding them
    # independently keeps the rows aligned.
    exploded_recommender = dataset["Recommender"].explode()
    exploded_ranking = dataset["Ranking"].explode()
    dataset = dataset.explode("ItemID")
    dataset["Recommender"] = exploded_recommender
    dataset["Ranking"] = exploded_ranking.astype("int")

    # Number of rows (i.e. recommenders) proposing each (user, item) pair.
    # `transform("size")` returns one value per input row in the original
    # row order, so no tuple-based lookup is needed.
    pair_counts = (
        dataset.reset_index()
        .groupby(["UserID", "ItemID"])["ItemID"]
        .transform("size")
    )
    dataset["recommender_agreement"] = pair_counts.to_numpy()

    for user_id in tqdm(dataset.index.unique(), desc="User (score)"):
        # The candidate list depends only on the user, so fetch it once
        # instead of once per scoring model. The list indexer guarantees a
        # Series even when the user has a single row.
        item_list = dataset.loc[[user_id], "ItemID"].to_list()
        for rec_label, rec_instance in score_models.items():
            all_item_scores = rec_instance._compute_item_score(
                [user_id], items_to_compute=item_list
            )

            dataset.loc[user_id, f"score_{rec_label}"] = all_item_scores[0, item_list]

    score_statistics = row_statistics(dataset, "score")
    dataset = pd.concat([dataset, score_statistics], axis="columns")

    dataset = dataset.reset_index()
    # Safety net for the case where the index name was lost along the way.
    dataset = dataset.rename(columns={"index": "UserID"})

    if fold is not None:
        dataset["fold"] = fold

    return dataset
def load_models_all(dir_: Path) -> "dict[str, BaseRecommender]":
    """Load every ``*.pkl`` file found directly inside ``dir_``.

    Args:
        dir_: Directory containing the pickled models.

    Returns:
        A dict mapping each file's stem (name without ``.pkl``) to the
        unpickled object, inserted in sorted path order.
    """
    models = {}
    # Sorted so the insertion order (and anything derived from it, such as
    # column order) does not depend on the filesystem's listing order.
    for path in sorted(dir_.glob("*.pkl")):
        # NOTE: unpickling executes arbitrary code; only point this at
        # model directories you produced yourself.
        with path.open("rb") as model_file:
            models[path.stem] = pickle.load(model_file)
    return models
def compute_dataset(
    use: Literal["training", "submission"],
    number_users: int,
    number_items: int,
    urm_df: pd.DataFrame,
    folds: Optional[int] = None,
):
    """Dispatch to the training or submission dataset builder.

    Args:
        use: ``"training"`` to build the dataset via
            ``compute_training_dataset``; ``"submission"`` to build it via
            ``compute_submission_dataset``.
        number_users: Number of users, forwarded to the selected builder.
        number_items: Number of items; only used for ``"training"``.
        urm_df: Interaction frame; only used for ``"training"``.
        folds: Number of folds; only used for ``"training"``.

    Returns:
        Whatever the selected builder returns.

    Raises:
        ValueError: If ``use`` is neither ``"training"`` nor
            ``"submission"``.
    """
    if use == "training":
        return compute_training_dataset(number_users, number_items, urm_df, folds)
    if use == "submission":
        return compute_submission_dataset(number_users)
    # Fail fast: silently returning None here only postpones the error to
    # the first caller that treats the result as a DataFrame.
    raise ValueError(
        f"Unknown dataset use {use!r}; expected 'training' or 'submission'"
    )
if __name__ == "__main__":
    # Script entry point: build the ranker dataset selected by USE, enrich it
    # with item/user features and write it to OUTPUT_PATH as parquet.

    # NOTE: `urm_df` must keep this exact module-level name; `add_features`
    # reads it as a global rather than receiving it as a parameter.
    icm_df, urm_df = load_raw()
    # NOTE(review): matrix dimensions are taken from the number of distinct
    # ids, which presumably assumes ids are contiguous and start at 0 --
    # confirm against the data loader.
    number_users = urm_df["user_id"].nunique()
    number_items = icm_df["item_id"].nunique()

    # Only the first two values returned by `load()` are used here.
    icm_matrix, urm_all, *_ = load()

    # `folds` is only forwarded to the training builder; the submission
    # builder does not receive it.
    dataset = compute_dataset(
        use=USE,
        number_users=number_users,
        number_items=number_items,
        urm_df=urm_df,
        folds=NUMBER_FOLDS,
    )

    dataset = add_features(dataset, urm_all, icm_matrix)

    # Store identifier-like columns with pandas' categorical dtype.
    for categorical_column in ("UserID", "ItemID", "Recommender"):
        dataset[categorical_column] = dataset[categorical_column].astype("category")

    dataset.to_parquet(OUTPUT_PATH)
# Keyword arguments passed to `recommender.fit(**HYPERPARAMETERS[key])` for
# each model id. The ids must match the keys returned by
# `recommender_factory`: every recommender built there is looked up here by
# the same key. Entries that are commented out are disabled; enabling one
# requires enabling the entry with the same id in `recommender_factory` too.
HYPERPARAMETERS: dict[int, dict] = {
    # User-wide hybrid 1 (0-10)
    # 0: {
    #     "topK": 22,
    #     "alpha": 0.015137951778257512,
    #     "normalize_similarity": True,
    #     "implicit": True,
    # },
    # 1: {
    #     "topK": 5,
    #     "shrink": 774,
    #     "similarity": "asymmetric",
    #     "normalize": True,
    #     "asymmetric_alpha": 0.0,
    #     "feature_weighting": "TF-IDF",
    #     "ICM_weight": 0.01,
    # },
    # 2: {
    #     "topK": 5,
    #     "shrink": 1000,
    #     "similarity": "asymmetric",
    #     "normalize": True,
    #     "asymmetric_alpha": 0.2626851799303072,
    #     "feature_weighting": "TF-IDF",
    #     "ICM_weight": 0.1560410093044209,
    # },
    # 3: {
    #     "topK": 1000,
    #     "alpha": 1.8920160119169898,
    #     "beta": 0.4950301468130674,
    #     "delta": 0.30908791366521954,
    #     "normalize_similarity": True,
    # },
    # 4: {
    #     "topK": 5,
    #     "shrink": 1000,
    #     "similarity": "asymmetric",
    #     "normalize": True,
    #     "asymmetric_alpha": 0.0,
    #     "feature_weighting": "TF-IDF",
    #     "ICM_weight": 0.06864228467890522,
    # },
    # 5: {
    #     "topK": 7,
    #     "shrink": 293,
    #     "similarity": "asymmetric",
    #     "normalize": True,
    #     "asymmetric_alpha": 0.0732688773175534,
    #     "feature_weighting": "TF-IDF",
    #     "ICM_weight": 0.23668747670276377,
    # },
    # 6: {
    #     "topK": 5,
    #     "shrink": 1000,
    #     "similarity": "asymmetric",
    #     "normalize": True,
    #     "asymmetric_alpha": 0.0,
    #     "feature_weighting": "TF-IDF",
    #     "ICM_weight": 0.06565478344525211,
    # },
    # 7: {
    #     "topK": 5,
    #     "shrink": 0,
    #     "similarity": "asymmetric",
    #     "normalize": True,
    #     "asymmetric_alpha": 0.0329315091653946,
    #     "feature_weighting": "BM25",
    #     "ICM_weight": 0.16124267891305158,
    # },
    # 8: {
    #     "topK": 5,
    #     "shrink": 1000,
    #     "similarity": "asymmetric",
    #     "normalize": True,
    #     "asymmetric_alpha": 0.0,
    #     "feature_weighting": "TF-IDF",
    #     "ICM_weight": 0.171628301912052,
    # },
    # 9: {
    #     "topK": 10,
    #     "alpha": 0.35225624527493254,
    #     "normalize_similarity": True,
    #     "implicit": True,
    # },
    # 10: {
    #     "topK": 44,
    #     "shrink": 473,
    #     "similarity": "asymmetric",
    #     "normalize": True,
    #     "asymmetric_alpha": 0.35983197418129564,
    #     "feature_weighting": "TF-IDF",
    #     "ICM_weight": 0.12542629369630146,
    # },
    # 20: {  # Item KNN CF+CBF
    #     "topK": 96,
    #     "shrink": 966,
    #     "similarity": "cosine",
    #     "normalize": True,
    #     "feature_weighting": "BM25",
    #     "ICM_weight": 0.015154282137075726,
    # },
    # 21: {  # SLIM ElasticNet
    #     "l1_ratio": 0.4408355927953408,
    #     "alpha": 0.00013519978876092592,
    #     "positive_only": False,
    #     "topK": 59,
    #     "do_feature_selection": True,
    # },
    # 22: {  # RP3 ICM
    #     "topK": 11,
    #     "alpha": 1.9811525250064195,
    #     "beta": 0.6832513917848906,
    #     "delta": 0.0037274512973076712,
    #     "normalize_similarity": True,
    #     "implicit": True,
    #     "min_rating": 1.0,
    # },
    # 23: {  # Item KNN CF
    #     "topK": 5,
    #     "shrink": 224,
    #     "similarity": "asymmetric",
    #     "normalize": True,
    #     "asymmetric_alpha": 0.0,
    #     "feature_weighting": "TF-IDF",
    # },
    # User-wide hybrid 2 (30-40)
    # 30: {
    #     "topK": 1000,
    #     "l1_ratio": 0.0036552968571563925,
    #     "alpha": 0.001,
    #     "positive_only": True,
    #     "do_feature_selection": True,
    # },
    # 31: {
    #     "topK": 1000,
    #     "l1_ratio": 0.0036439600383419896,
    #     "alpha": 0.001,
    #     "positive_only": True,
    #     "do_feature_selection": True,
    # },
    # 32: {
    #     "topK": 1000,
    #     "l1_ratio": 0.01294361044706415,
    #     "alpha": 0.001,
    #     "positive_only": True,
    #     "do_feature_selection": True,
    # },
    # 33: {
    #     "topK": 469,
    #     "l1_ratio": 0.0025724182700638666,
    #     "alpha": 0.001,
    #     "positive_only": True,
    #     "do_feature_selection": True,
    # },
    # 34: {
    #     "topK": 1000,
    #     "l1_ratio": 0.012451061879323577,
    #     "alpha": 0.001,
    #     "positive_only": True,
    #     "do_feature_selection": True,
    # },
    # 35: {
    #     "topK": 1000,
    #     "l1_ratio": 0.0037651439623475717,
    #     "alpha": 0.001,
    #     "positive_only": False,
    #     "do_feature_selection": True,
    # },
    # 36: {
    #     "topK": 1000,
    #     "l1_ratio": 0.009466188626970398,
    #     "alpha": 0.001,
    #     "positive_only": True,
    #     "do_feature_selection": True,
    # },
    # 37: {
    #     "topK": 196,
    #     "l1_ratio": 0.019833595367995636,
    #     "alpha": 0.001,
    #     "positive_only": True,
    #     "do_feature_selection": True,
    # },
    # 38: {
    #     "topK": 145,
    #     "l1_ratio": 2.6489644774823373e-05,
    #     "alpha": 0.001,
    #     "positive_only": True,
    #     "do_feature_selection": True,
    # },
    # 39: {
    #     "topK": 866,
    #     "l1_ratio": 0.019729118757762613,
    #     "alpha": 0.001,
    #     "positive_only": True,
    #     "do_feature_selection": True,
    # },
    # 40: {
    #     "topK": 44,
    #     "shrink": 473,
    #     "similarity": "asymmetric",
    #     "normalize": True,
    #     "asymmetric_alpha": 0.35983197418129564,
    #     "feature_weighting": "TF-IDF",
    #     "ICM_weight": 0.12542629369630146,
    # },
    # Score hybrid
    # 50: {  # RP3 ICM
    #     "topK": 79,
    #     "alpha": 0.7864757238135991,
    #     "beta": 0.443333110568691,
    #     "delta": 0.7593249588588719,
    #     "min_rating": 0.008553401844836345,
    #     "implicit": True,
    #     "normalize_similarity": True,
    # },
    # 51: {  # SLIM ElasticNet
    #     "l1_ratio": 0.04077479852537514,
    #     "alpha": 0.0004098922954204119,
    #     "positive_only": True,
    #     "topK": 144,
    #     "do_feature_selection": True,
    # },
    # Recall@10 optimised models
    # 60: {  # SLIM ElasticNet
    #     "topK": 1000,
    #     "l1_ratio": 0.009196376132404047,
    #     "alpha": 0.001,
    #     "positive_only": True,
    #     "do_feature_selection": True,
    # },
    # 61: {  # Item KNN CF+CBF
    #     "topK": 5,
    #     "shrink": 1000,
    #     "similarity": "asymmetric",
    #     "normalize": True,
    #     "asymmetric_alpha": 0.0,
    #     "feature_weighting": "TF-IDF",
    #     "ICM_weight": 0.1918507776404466,
    # },
    # 62: {  # Item KNN CF
    #     "topK": 5,
    #     "shrink": 1000,
    #     "similarity": "asymmetric",
    #     "normalize": True,
    #     "asymmetric_alpha": 0.12250234857130494,
    #     "feature_weighting": "TF-IDF",
    # },
    # 63: {  # RP3 ICM
    #     "topK": 556,
    #     "alpha": 2.0,
    #     "beta": 0.43088991464943555,
    #     "delta": 0.0,
    #     "normalize_similarity": True,
    # },
    # 64: {},  # Top Popular
    # Recall@50 optimised models
    70: {  # Item KNN CF
        "topK": 152,
        "shrink": 1000,
        "similarity": "asymmetric",
        "normalize": True,
        "asymmetric_alpha": 0.2696194971486583,
        "feature_weighting": "TF-IDF",
    },
    71: {  # Item KNN CF+CBF
        "topK": 43,
        "shrink": 937,
        "similarity": "asymmetric",
        "normalize": True,
        "asymmetric_alpha": 0.45165257781514373,
        "feature_weighting": "TF-IDF",
        "ICM_weight": 0.3749251701759684,
    },
    72: {  # SLIM ElasticNet
        "l1_ratio": 0.14747318214902194,
        "alpha": 0.00043480530562990655,
        "positive_only": False,
        "topK": 75,
        "do_feature_selection": True,
    },
    73: {  # RP3 ICM
        "topK": 351,
        "alpha": 1.8421852767137328,
        "beta": 0.26321333084561177,
        "delta": 0.27195701300859715,
        "normalize_similarity": True,
    },
}
def train_fold(i, train_indices, icm, num_users, num_items, urm_df):
    """Fit and pickle every configured recommender for one fold.

    Defined at module level (not inside the ``__main__`` guard) so worker
    processes can import it by name under the ``spawn`` start method, where
    the guarded block is not executed in the child.

    Args:
        i: Fold index; models are written to ``MODEL_DIR / str(i)``.
        train_indices: Positional row indices of ``urm_df`` forming the
            training split of this fold.
        icm: Item-feature matrix passed to ``recommender_factory``.
        num_users: Number of rows of the fold's user-item matrix.
        num_items: Number of columns of the fold's user-item matrix.
        urm_df: Interaction frame with ``user_id``, ``item_id`` and ``data``
            columns.
    """
    # Local import: the file only imports `datetime` inside the script guard.
    from datetime import datetime

    fold_dir = MODEL_DIR / str(i)
    # `parents=True` so a fresh checkout without the models tree still works.
    fold_dir.mkdir(parents=True, exist_ok=True)

    fold_urm_df = urm_df.iloc[train_indices]
    fold_urm = sps.csr_matrix(
        (fold_urm_df["data"], (fold_urm_df["user_id"], fold_urm_df["item_id"])),
        shape=(num_users, num_items),
    )

    fold_recommenders = recommender_factory(fold_urm, icm)
    for j, (key, recommender) in enumerate(fold_recommenders.items()):
        print(
            f"Fold {str(i).zfill(2)} Recommender {str(j).zfill(2)} {datetime.now()}"
        )
        recommender.fit(**HYPERPARAMETERS[key])
        with (fold_dir / f"{key}.pkl").open("wb") as f:
            pickle.dump(recommender, f)


if __name__ == "__main__":
    from concurrent.futures import ProcessPoolExecutor

    icm_df, urm_df = load_raw()
    num_users = urm_df["user_id"].nunique()
    # NOTE(review): the item count is taken from the interactions here;
    # confirm it matches the item dimension used by the scripts that load
    # these models.
    num_items = urm_df["item_id"].nunique()
    num_features = icm_df["feature_id"].nunique()

    icm = sps.csr_matrix(
        (icm_df.data, (icm_df.item_id, icm_df.feature_id)),
        shape=(num_items, num_features),
    )

    # At least one worker: `cpu_count() // 2` is 0 on a single-core machine
    # and ProcessPoolExecutor rejects max_workers=0.
    max_workers = max(1, cpu_count() // 2)

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Arguments are pickled when sent to the workers, so each task
        # already operates on its own copy; explicit `.copy()` calls would
        # only duplicate the data in the parent process.
        futures = [
            executor.submit(
                train_fold,
                i,
                train_indices,
                icm,
                num_users,
                num_items,
                urm_df,
            )
            for i, (train_indices, _) in enumerate(
                KFold(N_FOLDS, shuffle=True, random_state=42).split(urm_df)
            )
        ]
        # Surface any exception raised inside a worker.
        for future in futures:
            future.result()