Skip to content

Latest commit

 

History

History
1068 lines (809 loc) · 30.9 KB

File metadata and controls

1068 lines (809 loc) · 30.9 KB

📊 實際案例研究集

真實世界的數據科學與機器學習應用 從Kaggle比賽到企業實踐的完整案例分析


📖 目錄

  1. 熱門Kaggle競賽案例
  2. 企業應用案例
  3. 完整專案實戰
  4. AI輔助案例

熱門Kaggle競賽案例

🏆 案例 1: Titanic - 生存預測

競賽連結: Kaggle Titanic

難度: ⭐ 入門級

業務背景: 預測泰坦尼克號乘客的生存概率,這是機器學習最經典的入門項目。

數據集:

  • 訓練集: 891名乘客
  • 測試集: 418名乘客
  • 特徵: 12個(年齡、性別、艙位等級、票價等)

關鍵挑戰:

  1. ❌ 缺失值處理(年齡、登船港口)
  2. ❌ 特徵工程(家庭大小、稱謂提取)
  3. ❌ 類別不平衡(生存率僅38%)

完整解決方案

"""
Titanic生存預測 - 完整解決方案
目標: 準確率 > 80%
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

class TitanicPredictor:
    """End-to-end Titanic survival prediction pipeline.

    Covers loading, exploratory analysis, feature engineering, training of
    three base models plus a soft-voting ensemble, evaluation, feature
    importance, and Kaggle submission output.
    """

    def __init__(self):
        # Trained estimators keyed by model name; filled by train_models().
        self.models = {}
        # Kept for interface compatibility; the tree-based models don't use it.
        self.scaler = StandardScaler()
        # Column layout produced by the training pass of prepare_features();
        # used to align the test design matrix to the same columns.
        self.feature_names = None
        # Category levels of each categorical column as seen at training
        # time, so test-set one-hot encoding matches the training layout.
        self.category_levels = None

    def load_data(self, train_path: str, test_path: str) -> tuple:
        """Read the train and test CSV files and return (train_df, test_df)."""
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        return train_df, test_df

    def exploratory_analysis(self, df: pd.DataFrame):
        """Print summary statistics and plot the main survival factors."""
        print("=" * 80)
        print("泰坦尼克號數據集分析")
        print("=" * 80)

        # Shape and missing-value overview
        print(f"\n數據集大小: {df.shape}")
        print(f"\n缺失值:\n{df.isnull().sum()}")

        # Overall survival rate
        survival_rate = df['Survived'].mean()
        print(f"\n總體生存率: {survival_rate:.2%}")

        # Survival rate by sex
        gender_survival = df.groupby('Sex')['Survived'].mean()
        print(f"\n性別生存率:\n{gender_survival}")

        # Survival rate by passenger class
        class_survival = df.groupby('Pclass')['Survived'].mean()
        print(f"\n艙位等級生存率:\n{class_survival}")

        # Plots
        self._visualize_survival_factors(df)

    def _visualize_survival_factors(self, df: pd.DataFrame):
        """Plot survival against sex, class, age and fare; save to PNG."""
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # 1. Sex vs survival
        sns.countplot(data=df, x='Sex', hue='Survived', ax=axes[0, 0])
        axes[0, 0].set_title('性別與生存')
        axes[0, 0].set_ylabel('人數')

        # 2. Passenger class vs survival
        sns.countplot(data=df, x='Pclass', hue='Survived', ax=axes[0, 1])
        axes[0, 1].set_title('艙位等級與生存')
        axes[0, 1].set_ylabel('人數')

        # 3. Age distribution split by outcome
        df[df['Survived'] == 0]['Age'].hist(ax=axes[1, 0], bins=30, alpha=0.5, label='未生存')
        df[df['Survived'] == 1]['Age'].hist(ax=axes[1, 0], bins=30, alpha=0.5, label='生存')
        axes[1, 0].set_title('年齡分佈與生存')
        axes[1, 0].set_xlabel('年齡')
        axes[1, 0].legend()

        # 4. Fare distribution split by outcome
        df[df['Survived'] == 0]['Fare'].hist(ax=axes[1, 1], bins=30, alpha=0.5, label='未生存')
        df[df['Survived'] == 1]['Fare'].hist(ax=axes[1, 1], bins=30, alpha=0.5, label='生存')
        axes[1, 1].set_title('票價分佈與生存')
        axes[1, 1].set_xlabel('票價')
        axes[1, 1].legend()

        plt.tight_layout()
        plt.savefig('titanic_eda.png', dpi=300, bbox_inches='tight')
        plt.show()

    def feature_engineering(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with engineered features added.

        Adds Title, FamilySize, IsAlone, AgeBin, FareBin and two interaction
        columns, and fills missing Age / Fare / Embarked values.

        NOTE(review): imputation statistics are computed per-frame, so the
        test set is imputed from its own medians/mode rather than the
        training set's — acceptable for this tutorial, but a leakage-free
        pipeline would fit imputers on training data only.
        """
        df_fe = df.copy()

        # 1. Extract the honorific from the name ("Braund, Mr. Owen" -> "Mr")
        df_fe['Title'] = df_fe['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

        # Collapse rare titles and normalise French/abbreviated variants
        rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                       'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
        df_fe['Title'] = df_fe['Title'].replace(rare_titles, 'Rare')
        df_fe['Title'] = df_fe['Title'].replace('Mlle', 'Miss')
        df_fe['Title'] = df_fe['Title'].replace('Ms', 'Miss')
        df_fe['Title'] = df_fe['Title'].replace('Mme', 'Mrs')

        # 2. Family size = siblings/spouses + parents/children + self
        df_fe['FamilySize'] = df_fe['SibSp'] + df_fe['Parch'] + 1

        # 3. Travelling-alone flag
        df_fe['IsAlone'] = (df_fe['FamilySize'] == 1).astype(int)

        # 4. Age: median-impute, then bucket
        df_fe['Age'] = df_fe['Age'].fillna(df_fe['Age'].median())
        df_fe['AgeBin'] = pd.cut(df_fe['Age'], bins=[0, 12, 18, 35, 60, 100],
                                 labels=['Child', 'Teen', 'Adult', 'MiddleAge', 'Senior'])

        # 5. Fare: median-impute, then quartile buckets
        df_fe['Fare'] = df_fe['Fare'].fillna(df_fe['Fare'].median())
        df_fe['FareBin'] = pd.qcut(df_fe['Fare'], q=4, labels=['Low', 'Medium', 'High', 'VeryHigh'])

        # 6. Embarked: mode-impute
        df_fe['Embarked'] = df_fe['Embarked'].fillna(df_fe['Embarked'].mode()[0])

        # 7. Interaction features
        df_fe['Sex_Class'] = df_fe['Sex'] + '_' + df_fe['Pclass'].astype(str)
        df_fe['Title_Class'] = df_fe['Title'] + '_' + df_fe['Pclass'].astype(str)

        return df_fe

    def prepare_features(self, df: pd.DataFrame, is_train: bool = True) -> pd.DataFrame:
        """Build the model design matrix from an engineered frame.

        One-hot encodes the categorical columns.  On the training pass
        (is_train=True) the category levels and final column layout are
        remembered; on later passes the categorical columns are encoded
        against those remembered levels and the result is reindexed to the
        training layout.  Without this, a category absent from the test set
        would silently shift or drop dummy columns and mis-align every
        feature fed to the models.
        """
        # Feature subset used for modelling
        features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
                    'Embarked', 'Title', 'FamilySize', 'IsAlone']

        X = df[features].copy()

        categorical_features = ['Sex', 'Embarked', 'Title']

        if is_train:
            # Remember the category levels seen during training.
            self.category_levels = {
                col: pd.Categorical(X[col]).categories
                for col in categorical_features
            }
        elif getattr(self, 'category_levels', None) is not None:
            # Bug fix: encode the test set against the training levels so
            # get_dummies emits the same dummy columns in the same order.
            # Categories unseen at training time become NaN -> all-zero rows.
            for col in categorical_features:
                X[col] = pd.Categorical(X[col], categories=self.category_levels[col])

        X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

        if is_train:
            self.feature_names = X.columns.tolist()
        elif self.feature_names is not None:
            # Safety net: enforce the exact training column order.
            X = X.reindex(columns=self.feature_names, fill_value=0)

        return X

    def train_models(self, X_train: pd.DataFrame, y_train: pd.Series):
        """Train the three base models and the soft-voting ensemble,
        printing 5-fold cross-validation accuracy for each."""

        print("\n" + "=" * 80)
        print("訓練模型")
        print("=" * 80)

        # 1. Logistic Regression (linear baseline)
        print("\n1. 訓練 Logistic Regression...")
        lr = LogisticRegression(random_state=42, max_iter=1000)
        lr.fit(X_train, y_train)
        self.models['LogisticRegression'] = lr

        lr_scores = cross_val_score(lr, X_train, y_train, cv=5)
        print(f"   交叉驗證準確率: {lr_scores.mean():.4f} (+/- {lr_scores.std() * 2:.4f})")

        # 2. Random Forest (bagged trees)
        print("\n2. 訓練 Random Forest...")
        rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
        rf.fit(X_train, y_train)
        self.models['RandomForest'] = rf

        rf_scores = cross_val_score(rf, X_train, y_train, cv=5)
        print(f"   交叉驗證準確率: {rf_scores.mean():.4f} (+/- {rf_scores.std() * 2:.4f})")

        # 3. Gradient Boosting (sequential trees)
        print("\n3. 訓練 Gradient Boosting...")
        gb = GradientBoostingClassifier(n_estimators=100, random_state=42, max_depth=5)
        gb.fit(X_train, y_train)
        self.models['GradientBoosting'] = gb

        gb_scores = cross_val_score(gb, X_train, y_train, cv=5)
        print(f"   交叉驗證準確率: {gb_scores.mean():.4f} (+/- {gb_scores.std() * 2:.4f})")

        # 4. Soft-voting ensemble over the three base models
        print("\n4. 創建集成模型...")
        self._create_ensemble_model(X_train, y_train)

    def _create_ensemble_model(self, X_train: pd.DataFrame, y_train: pd.Series):
        """Build and fit a soft-voting ensemble of the three base models."""
        from sklearn.ensemble import VotingClassifier

        voting_clf = VotingClassifier(
            estimators=[
                ('lr', self.models['LogisticRegression']),
                ('rf', self.models['RandomForest']),
                ('gb', self.models['GradientBoosting'])
            ],
            voting='soft'  # average predicted probabilities
        )

        voting_clf.fit(X_train, y_train)
        self.models['Ensemble'] = voting_clf

        ensemble_scores = cross_val_score(voting_clf, X_train, y_train, cv=5)
        print(f"   交叉驗證準確率: {ensemble_scores.mean():.4f} (+/- {ensemble_scores.std() * 2:.4f})")

    def evaluate_models(self, X_test: pd.DataFrame, y_test: pd.Series) -> dict:
        """Score every trained model on held-out data.

        Returns a dict mapping model name to accuracy/precision/recall/
        F1/AUC, and renders a comparison bar chart.
        """
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

        print("\n" + "=" * 80)
        print("模型評估")
        print("=" * 80)

        results = {}

        for name, model in self.models.items():
            y_pred = model.predict(X_test)
            # Positive-class probability for AUC
            y_pred_proba = model.predict_proba(X_test)[:, 1]

            results[name] = {
                'accuracy': accuracy_score(y_test, y_pred),
                'precision': precision_score(y_test, y_pred),
                'recall': recall_score(y_test, y_pred),
                'f1': f1_score(y_test, y_pred),
                'auc': roc_auc_score(y_test, y_pred_proba)
            }

            print(f"\n{name}:")
            print(f"  準確率:   {results[name]['accuracy']:.4f}")
            print(f"  精確率:   {results[name]['precision']:.4f}")
            print(f"  召回率:   {results[name]['recall']:.4f}")
            print(f"  F1分數:   {results[name]['f1']:.4f}")
            print(f"  AUC:      {results[name]['auc']:.4f}")

        # Comparison chart
        self._visualize_model_comparison(results)

        return results

    def _visualize_model_comparison(self, results: dict):
        """Render a grouped bar chart of per-model metrics; save to PNG."""
        metrics_df = pd.DataFrame(results).T

        fig, ax = plt.subplots(figsize=(12, 6))
        metrics_df.plot(kind='bar', ax=ax)
        ax.set_title('模型性能比較')
        ax.set_ylabel('分數')
        ax.set_xlabel('模型')
        ax.legend(title='評估指標')
        ax.set_ylim([0, 1])
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
        plt.show()

    def get_feature_importance(self) -> pd.DataFrame:
        """Return Random-Forest feature importances (descending) and plot
        the top 15.  Requires train_models() to have run."""
        rf_model = self.models['RandomForest']

        importance_df = pd.DataFrame({
            'Feature': self.feature_names,
            'Importance': rf_model.feature_importances_
        }).sort_values('Importance', ascending=False)

        # Bar chart of the most informative features
        plt.figure(figsize=(10, 6))
        sns.barplot(data=importance_df.head(15), x='Importance', y='Feature')
        plt.title('Top 15 重要特徵')
        plt.tight_layout()
        plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
        plt.show()

        return importance_df

    def predict(self, X: pd.DataFrame, model_name: str = 'Ensemble') -> np.ndarray:
        """Predict survival labels with the named trained model.

        Raises ValueError if the model has not been trained/registered.
        """
        model = self.models.get(model_name)
        if model is None:
            raise ValueError(f"模型 {model_name} 不存在")

        return model.predict(X)

    def save_submission(self, test_df: pd.DataFrame, predictions: np.ndarray,
                       filename: str = 'titanic_submission.csv'):
        """Write a Kaggle-format submission CSV (PassengerId, Survived)."""
        submission = pd.DataFrame({
            'PassengerId': test_df['PassengerId'],
            'Survived': predictions
        })
        submission.to_csv(filename, index=False)
        # Bug fix: the message previously printed a literal placeholder
        # instead of interpolating the output filename.
        print(f"\n✅ 提交文件已保存: {filename}")

# 完整流程示例
def main():
    """Drive the full Titanic pipeline end to end.

    Steps: load -> EDA -> feature engineering -> train/validate ->
    feature importance -> retrain on all labelled data -> predict ->
    write the Kaggle submission file.
    """
    print("=" * 80)
    print("泰坦尼克號生存預測 - 完整流程")
    print("=" * 80)

    predictor = TitanicPredictor()

    # Load the raw Kaggle CSVs.
    print("\n載入數據...")
    raw_train, raw_test = predictor.load_data('train.csv', 'test.csv')

    # Exploratory analysis on the labelled data.
    print("\n執行探索性分析...")
    predictor.exploratory_analysis(raw_train)

    # Engineer features for both splits.
    print("\n特徵工程...")
    engineered_train = predictor.feature_engineering(raw_train)
    engineered_test = predictor.feature_engineering(raw_test)

    # Build design matrices; the training pass fixes the column layout.
    X_all = predictor.prepare_features(engineered_train, is_train=True)
    y_all = engineered_train['Survived']

    X_test = predictor.prepare_features(engineered_test, is_train=False)

    # Hold out 20% of the labelled data for validation.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_all, y_all, test_size=0.2, random_state=42
    )

    # Fit the model zoo on the training split.
    predictor.train_models(X_fit, y_fit)

    # Score every model on the held-out split.
    validation_results = predictor.evaluate_models(X_val, y_val)

    # Inspect which engineered features drive the random forest.
    print("\n分析特徵重要性...")
    importance = predictor.get_feature_importance()
    print(f"\nTop 10 重要特徵:\n{importance.head(10)}")

    # Refit on every labelled row before the final prediction.
    print("\n使用完整訓練集重新訓練...")
    predictor.train_models(X_all, y_all)

    # Predict the Kaggle test set with the soft-voting ensemble.
    print("\n預測測試集...")
    predictions = predictor.predict(X_test, model_name='Ensemble')

    # Write the submission file.
    predictor.save_submission(raw_test, predictions)

    print("\n" + "=" * 80)
    print("完成! 🎉")
    print("=" * 80)

if __name__ == "__main__":
    main()

AI輔助改進

使用AI優化代碼:

# 詢問AI
"""
這是我的Titanic預測代碼。請建議:
1. 可以添加的新特徵
2. 超參數優化方法
3. 模型集成策略
4. 提升準確率到85%+的方法
"""

# AI可能建議:
# 1. 添加Cabin特徵(提取甲板層)
# 2. 使用Optuna進行超參數優化
# 3. 堆疊集成(Stacking)
# 4. 交叉驗證策略優化

🏆 案例 2: House Prices - 房價預測

競賽連結: Kaggle House Prices

難度: ⭐⭐ 中級

業務背景: 預測Ames愛荷華州房屋的銷售價格,包含79個特徵。

關鍵技術:

  • 📊 高級特徵工程
  • 🔧 正則化回歸(Ridge, Lasso, ElasticNet)
  • 🎯 梯度提升(XGBoost, LightGBM)
  • 🔄 模型堆疊(Stacking)

完整代碼:

cd kaggle_solutions/01_structured_data/02_house_prices
python solution.py

🏆 案例 3: Santander Customer Transaction Prediction

競賽連結: Kaggle Santander

難度: ⭐⭐⭐ 高級

業務背景: 預測客戶是否會進行特定交易,200個匿名特徵。

關鍵挑戰:

  1. 高度不平衡數據(10% 正樣本)
  2. 匿名特徵(無法解釋)
  3. 需要高AUC分數(>0.90)

解決策略:

class SantanderPredictor:
    """
    Santander customer-transaction prediction.

    Strategy:
    1. Feature selection (drop low-variance features)
    2. Adversarial validation (detect train/test distribution shift)
    3. Pseudo-labelling (semi-supervised learning)
    4. LightGBM + cross-validation
    """

    def feature_selection(self, X_train, y_train, threshold=0.01):
        """Two-stage feature selection; returns the filtered frame.

        Stage 1 drops features whose variance is <= *threshold* — the
        original code computed this with VarianceThreshold but discarded
        the result, so the filter never applied; it is now implemented
        directly in pandas and actually used.  Stage 2 keeps features whose
        absolute correlation with the target exceeds 0.01.
        """
        X_train = pd.DataFrame(X_train)

        # 1. Variance filter (NOTE: pandas .var() uses ddof=1, slightly
        #    different from sklearn's population variance — acceptable here).
        variances = X_train.var()
        X_var = X_train[variances[variances > threshold].index]

        # 2. Correlation with the target
        correlations = X_var.corrwith(y_train).abs()
        important_features = correlations[correlations > 0.01].index

        return X_var[important_features]

    def adversarial_validation(self, X_train, X_test):
        """Adversarial validation: can a classifier tell train from test?

        An ROC-AUC near 0.5 means the two distributions are similar.
        """
        from sklearn.model_selection import cross_val_score
        from lightgbm import LGBMClassifier

        # Bug fix: label copies so the callers' frames are not mutated
        # (the original added an 'is_test' column in place).
        X_train = X_train.copy()
        X_test = X_test.copy()
        X_train['is_test'] = 0
        X_test['is_test'] = 1
        X_combined = pd.concat([X_train, X_test])

        # Bug fix: score with ROC-AUC; cross_val_score's default scoring
        # (accuracy) did not match the printed "AUC" label.
        clf = LGBMClassifier()
        scores = cross_val_score(clf, X_combined.drop('is_test', axis=1),
                                 X_combined['is_test'], cv=5,
                                 scoring='roc_auc')

        print(f"對抗驗證AUC: {scores.mean():.4f}")

    def train_with_pseudo_labeling(self, X_train, y_train, X_test):
        """Semi-supervised training via pseudo-labels.

        Fits an initial model, harvests confidently-scored test rows
        (p > 0.9 or p < 0.1), appends them with their predicted labels,
        refits, and returns the refitted model.
        """
        import lightgbm as lgb

        # Initial fit on the labelled data
        model = lgb.LGBMClassifier(n_estimators=1000)
        model.fit(X_train, y_train)

        # Score the unlabelled test rows
        test_proba = model.predict_proba(X_test)[:, 1]

        # Keep only high-confidence rows
        high_conf_idx = (test_proba > 0.9) | (test_proba < 0.1)
        pseudo_labels = (test_proba[high_conf_idx] > 0.5).astype(int)

        # Bug fix: give the pseudo-label Series the matching row index so
        # X and y stay aligned after concatenation.
        X_train_extended = pd.concat([X_train, X_test[high_conf_idx]])
        y_train_extended = pd.concat([
            y_train,
            pd.Series(pseudo_labels, index=X_test.index[high_conf_idx])
        ])

        # Refit on the extended training set
        model.fit(X_train_extended, y_train_extended)

        return model

# 完整代碼在:
# kaggle_solutions/01_structured_data/08_santander_transaction/solution.py

🏆 案例 4: Optiver Realized Volatility Prediction

競賽連結: Kaggle Optiver

難度: ⭐⭐⭐⭐ 專家級

業務背景: 預測金融市場的實現波動率,涉及高頻交易數據。

技術亮點:

  • ⚡ 大規模時間序列特徵工程
  • 📈 LSTM + Transformer
  • 🎯 TAQ數據處理
  • 🔧 實時推理優化

代碼位置:

cd kaggle_solutions/02_time_series/28_volatility_prediction
python solution.py

企業應用案例

💼 案例 5: 電商客戶終身價值(CLV)預測

業務場景: 某電商公司需要識別高價值客戶並制定差異化營銷策略。

數據:

  • 100萬客戶
  • 3年交易歷史
  • 用戶行為數據

解決方案:

class CLVPredictionSystem:
    """
    Customer lifetime value (CLV) prediction system.

    Approach:
    1. BG/NBD model (purchase frequency and churn probability)
    2. Gamma-Gamma model (average transaction value)
    3. Machine-learning enhancement (XGBoost)
    """

    def __init__(self):
        from lifetimes import BetaGeoFitter, GammaGammaFitter
        self.bgf = BetaGeoFitter()
        self.ggf = GammaGammaFitter()

    def prepare_rfm_data(self, transaction_df):
        """Roll raw transactions up into an RFM summary frame."""
        from lifetimes.utils import summary_data_from_transaction_data

        return summary_data_from_transaction_data(
            transaction_df,
            'customer_id',
            'transaction_date',
            'amount'
        )

    def predict_clv(self, rfm, months=12, discount_rate=0.01):
        """Estimate discounted CLV over the next *months* periods."""
        # Fit the BG/NBD purchase-frequency model.
        self.bgf.fit(rfm['frequency'], rfm['recency'], rfm['T'])

        # Fit the Gamma-Gamma spend model on repeat customers only.
        repeaters = rfm[rfm['frequency'] > 0]
        self.ggf.fit(
            repeaters['frequency'],
            repeaters['monetary_value']
        )

        # Expected number of future purchases over the horizon.
        horizon = months
        expected_purchases = self.bgf.predict(
            horizon, rfm['frequency'], rfm['recency'], rfm['T']
        )

        # Expected average profit per transaction.
        expected_profit = self.ggf.conditional_expected_average_profit(
            rfm['frequency'], rfm['monetary_value']
        )

        # Undiscounted CLV, then apply an annuity discount factor.
        raw_clv = expected_purchases * expected_profit
        annuity = (1 - (1 + discount_rate) ** -horizon) / discount_rate
        return raw_clv * annuity

    def segment_customers(self, clv_predictions):
        """Bucket customers into five equal-frequency CLV tiers."""
        tier_labels = ['Bronze', 'Silver', 'Gold', 'Platinum', 'Diamond']
        return pd.qcut(clv_predictions, q=5, labels=tier_labels)

    def generate_marketing_strategies(self, segments):
        """Return the per-tier marketing playbook (budget share,
        campaign list, expected ROI)."""
        playbook = {}
        playbook['Diamond'] = {
            'budget_allocation': 0.4,
            'campaigns': ['VIP專屬', '個性化推薦', '優先客服'],
            'expected_roi': 5.0
        }
        playbook['Platinum'] = {
            'budget_allocation': 0.3,
            'campaigns': ['忠誠度計劃', '會員專屬折扣'],
            'expected_roi': 3.5
        }
        playbook['Gold'] = {
            'budget_allocation': 0.2,
            'campaigns': ['季節性促銷', '新品推薦'],
            'expected_roi': 2.5
        }
        playbook['Silver'] = {
            'budget_allocation': 0.08,
            'campaigns': ['激活活動', '交叉銷售'],
            'expected_roi': 1.8
        }
        playbook['Bronze'] = {
            'budget_allocation': 0.02,
            'campaigns': ['基礎觸達', '重新激活'],
            'expected_roi': 1.2
        }
        return playbook

# 實際應用
# 查看: docs/04_personality_analysis.md

業務成果:

  • ✅ 識別出Top 20%客戶貢獻80%收入
  • ✅ 營銷ROI提升250%
  • ✅ 客戶流失率降低35%

💼 案例 6: 智能推薦系統

業務場景: 視頻平台需要個性化推薦系統提升用戶參與度。

技術架構:

class HybridRecommendationSystem:
    """
    Hybrid recommendation system.

    Combines:
    1. Collaborative filtering
    2. Content-based recommendation
    3. Deep learning (neural collaborative filtering, NCF)

    NOTE(review): the private helpers referenced below (_train_cf,
    _train_content_based, _get_*_recommendations, _precision_at_k, ...)
    are not defined in this file — presumably elided for the write-up;
    confirm against the full project source before use.
    """

    def __init__(self):
        self.cf_model = None  # collaborative-filtering model
        self.cb_model = None  # content-based model
        self.ncf_model = None  # neural collaborative filtering model

    def build_ncf_model(self, n_users, n_items, embedding_dim=50):
        """Build a NeuMF-style Keras model: an element-wise (MF) branch
        and an MLP branch over user/item embeddings, fused into a single
        sigmoid output trained with binary cross-entropy."""
        from tensorflow.keras import layers, Model

        # User tower: id -> embedding -> flat vector
        user_input = layers.Input(shape=(1,), name='user')
        user_embedding = layers.Embedding(n_users, embedding_dim)(user_input)
        user_vec = layers.Flatten()(user_embedding)

        # Item tower: id -> embedding -> flat vector
        item_input = layers.Input(shape=(1,), name='item')
        item_embedding = layers.Embedding(n_items, embedding_dim)(item_input)
        item_vec = layers.Flatten()(item_embedding)

        # Matrix-factorization branch (element-wise product)
        mf = layers.Multiply()([user_vec, item_vec])

        # MLP branch over the concatenated embeddings
        mlp = layers.Concatenate()([user_vec, item_vec])
        mlp = layers.Dense(128, activation='relu')(mlp)
        mlp = layers.Dropout(0.2)(mlp)
        mlp = layers.Dense(64, activation='relu')(mlp)
        mlp = layers.Dropout(0.2)(mlp)

        # Fuse the two branches into one probability
        concat = layers.Concatenate()([mf, mlp])
        output = layers.Dense(1, activation='sigmoid')(concat)

        model = Model(inputs=[user_input, item_input], outputs=output)
        model.compile(optimizer='adam', loss='binary_crossentropy',
                     metrics=['AUC'])

        return model

    def train_hybrid_model(self, interactions, user_features, item_features):
        """Train all three component models."""
        # 1. Collaborative filtering
        self.cf_model = self._train_cf(interactions)

        # 2. Content-based model
        self.cb_model = self._train_content_based(item_features)

        # 3. NCF
        self.ncf_model = self.build_ncf_model(
            n_users=len(user_features),
            n_items=len(item_features)
        )
        # NCF training elided here...

    def recommend(self, user_id, n_recommendations=10, strategy='hybrid'):
        """Return the top-N item ids for a user.

        NOTE(review): only the 'hybrid' strategy is implemented; any other
        value falls through and returns None — confirm that is intended.
        """
        if strategy == 'hybrid':
            # Score with all three recommenders
            cf_recs = self._get_cf_recommendations(user_id)
            cb_recs = self._get_cb_recommendations(user_id)
            ncf_recs = self._get_ncf_recommendations(user_id)

            # Fixed-weight late fusion of the three score vectors
            final_scores = 0.3 * cf_recs + 0.3 * cb_recs + 0.4 * ncf_recs
            top_items = final_scores.nlargest(n_recommendations).index

            return top_items

    def evaluate(self, test_interactions):
        """Compute ranking metrics at k=10 on held-out interactions."""
        from sklearn.metrics import ndcg_score

        metrics = {
            'precision@10': self._precision_at_k(test_interactions, k=10),
            'recall@10': self._recall_at_k(test_interactions, k=10),
            'ndcg@10': self._ndcg_at_k(test_interactions, k=10),
            'map@10': self._map_at_k(test_interactions, k=10)
        }

        return metrics

# 完整實現:
# kaggle_solutions/04_recommendation/19_movie_recommendation/solution.py

業務成果:

  • ✅ 用戶參與度提升40%
  • ✅ 觀看時長增加25%
  • ✅ 留存率提高30%

💼 案例 7: 信用卡詐欺檢測

業務場景: 銀行需要實時檢測信用卡交易中的詐欺行為。

挑戰:

  1. 極度不平衡(詐欺率 < 0.1%)
  2. 實時性要求(< 100ms)
  3. 高召回率要求(> 95%)

解決方案:

class FraudDetectionSystem:
    """Real-time credit-card fraud detection system.

    Trains a rebalanced soft-voting ensemble, tunes a recall-weighted
    decision threshold, and scores single transactions on demand.
    """

    def __init__(self):
        # Fitted ensemble (set by train_ensemble_model).
        self.model = None
        # Placeholder for a feature scaler; unused in this snippet.
        self.scaler = None
        # Decision threshold on the fraud probability (tuned for F2).
        self.threshold = 0.5

    def handle_imbalance(self, X, y, method='smote'):
        """Resample (X, y) to counter extreme class imbalance.

        method: 'smote' (oversample minority), 'undersampling' (downsample
        majority), or 'combined' (SMOTE + Tomek links).  Raises ValueError
        on any other value — previously an unknown method fell through to
        an UnboundLocalError on X_balanced.
        """
        if method == 'smote':
            from imblearn.over_sampling import SMOTE
            smote = SMOTE(random_state=42)
            X_balanced, y_balanced = smote.fit_resample(X, y)

        elif method == 'undersampling':
            from imblearn.under_sampling import RandomUnderSampler
            rus = RandomUnderSampler(random_state=42)
            X_balanced, y_balanced = rus.fit_resample(X, y)

        elif method == 'combined':
            from imblearn.combine import SMOTETomek
            smt = SMOTETomek(random_state=42)
            X_balanced, y_balanced = smt.fit_resample(X, y)

        else:
            # Bug fix: fail loudly instead of crashing on an unbound local.
            raise ValueError(f"Unknown resampling method: {method}")

        return X_balanced, y_balanced

    def train_ensemble_model(self, X_train, y_train):
        """Train a soft-voting RF + XGBoost + LightGBM ensemble on
        rebalanced data, then tune the decision threshold."""
        from sklearn.ensemble import RandomForestClassifier
        from xgboost import XGBClassifier
        from lightgbm import LGBMClassifier
        from sklearn.ensemble import VotingClassifier

        # Rebalance with SMOTE + Tomek links
        X_balanced, y_balanced = self.handle_imbalance(X_train, y_train, 'combined')

        # Base learners, each with its own imbalance handling
        rf = RandomForestClassifier(n_estimators=100, class_weight='balanced')
        xgb = XGBClassifier(n_estimators=100, scale_pos_weight=99)
        lgbm = LGBMClassifier(n_estimators=100, class_weight='balanced')

        self.model = VotingClassifier(
            estimators=[('rf', rf), ('xgb', xgb), ('lgbm', lgbm)],
            voting='soft'  # average predicted probabilities
        )

        self.model.fit(X_balanced, y_balanced)

        # Tune the threshold on the ORIGINAL (unbalanced) training data
        # so it reflects the real class distribution.
        self.threshold = self._optimize_threshold(X_train, y_train)

    def _optimize_threshold(self, X, y):
        """Grid-search the probability threshold that maximises the F2
        score (recall-weighted — missed fraud is costlier than a false
        alarm)."""
        from sklearn.metrics import fbeta_score

        y_proba = self.model.predict_proba(X)[:, 1]

        best_threshold = 0.5
        best_f2 = 0

        for threshold in np.arange(0.1, 0.9, 0.05):
            y_pred = (y_proba >= threshold).astype(int)
            f2 = fbeta_score(y, y_pred, beta=2)

            if f2 > best_f2:
                best_f2 = f2
                best_threshold = threshold

        print(f"最佳閾值: {best_threshold:.2f}, F2分數: {best_f2:.4f}")
        return best_threshold

    def predict_realtime(self, transaction):
        """Score one transaction dict and return the decision payload
        (fraud flag, probability, risk bucket, BLOCK/APPROVE action)."""
        # Feature extraction
        features = self._extract_features(transaction)

        # Fraud probability from the ensemble
        fraud_proba = self.model.predict_proba([features])[0, 1]

        # Threshold decision
        is_fraud = fraud_proba >= self.threshold

        # Coarse risk bucket for downstream routing
        risk_level = self._calculate_risk_level(fraud_proba)

        return {
            'is_fraud': bool(is_fraud),
            'fraud_probability': float(fraud_proba),
            'risk_level': risk_level,
            'action': 'BLOCK' if is_fraud else 'APPROVE'
        }

    def _extract_features(self, transaction):
        """Build the 10-element feature vector from a transaction dict.

        NOTE(review): the keys read here define the expected input schema —
        confirm the upstream producer supplies exactly these fields.
        """
        features = []

        # 1. Amount (raw and log-scaled)
        features.append(transaction['amount'])
        features.append(np.log1p(transaction['amount']))

        # 2. Time-of-transaction features
        features.append(transaction['hour'])
        features.append(transaction['day_of_week'])
        features.append(transaction['is_weekend'])

        # 3. Geographic features
        features.append(transaction['distance_from_home'])
        features.append(transaction['distance_from_last_transaction'])

        # 4. Behavioural / velocity features
        features.append(transaction['transaction_frequency_1h'])
        features.append(transaction['transaction_frequency_24h'])
        features.append(transaction['avg_amount_last_10'])

        return np.array(features)

    def _calculate_risk_level(self, probability):
        """Map a fraud probability onto a coarse risk bucket."""
        if probability < 0.3:
            return 'LOW'
        elif probability < 0.6:
            return 'MEDIUM'
        elif probability < 0.8:
            return 'HIGH'
        else:
            return 'CRITICAL'

# Deploy the detector as an HTTP API (FastAPI).
from fastapi import FastAPI
app = FastAPI()

# NOTE(review): instantiated at import time, so the model is untrained
# until train_ensemble_model is called elsewhere — confirm startup wiring.
fraud_detector = FraudDetectionSystem()

@app.post("/detect_fraud")
async def detect_fraud(transaction: dict):
    """Score a single transaction and return the fraud decision payload."""
    result = fraud_detector.predict_realtime(transaction)
    return result

# 完整代碼:
# kaggle_solutions/01_structured_data/03_fraud_detection/solution.py

業務成果:

  • ✅ 詐欺檢測準確率 98.5%
  • ✅ 誤報率降低60%
  • ✅ 每年挽回損失 $5M+

完整專案實戰

🚀 專案 1: 端到端客戶分析平台

專案目標: 構建完整的客戶分析與營銷自動化平台。

技術棧:

  • Backend: Python, FastAPI
  • Database: PostgreSQL, Redis
  • ML: Scikit-learn, XGBoost, TensorFlow
  • Frontend: Streamlit / React
  • Deployment: Docker, Kubernetes

專案結構:

customer-analytics-platform/
├── backend/
│   ├── api/
│   │   ├── endpoints/
│   │   │   ├── customers.py
│   │   │   ├── segmentation.py
│   │   │   ├── campaigns.py
│   │   │   └── predictions.py
│   │   └── main.py
│   ├── ml/
│   │   ├── models/
│   │   ├── preprocessing/
│   │   └── training/
│   └── database/
├── frontend/
│   └── streamlit_app.py
├── notebooks/
│   └── exploratory_analysis.ipynb
├── tests/
├── docker-compose.yml
└── requirements.txt

核心功能:

  1. 實時客戶分群
  2. CLV預測
  3. 流失預警
  4. 智能營銷推薦
  5. A/B測試框架
  6. 實時儀表板

代碼示例 (查看本專案 app.py):

# 啟動完整平台
streamlit run app.py

AI輔助案例

🤖 使用Claude優化代碼

場景: 優化客戶分群代碼性能

對話記錄:

用戶: "我的K-means聚類代碼處理10萬客戶需要5分鐘,如何優化?"

Claude: "我建議以下優化方案:

1. 使用MiniBatchKMeans
2. 特徵降維(PCA)
3. 並行處理
4. 採用近似算法

優化後的代碼:

[提供優化代碼]

預期性能提升: 10x (30秒內完成)
"

用戶: "太好了!還有其他建議嗎?"

Claude: "可以考慮:
1. 使用Dask處理大數據
2. GPU加速(cuML)
3. 增量學習

[提供詳細實現]
"

總結

本案例集涵蓋:

  • ✅ 4個Kaggle競賽案例
  • ✅ 3個企業應用案例
  • ✅ 1個完整專案實戰
  • ✅ AI輔助開發示例

下一步:

  1. 選擇一個案例深入學習
  2. 運行完整代碼
  3. 嘗試改進和優化
  4. 應用到實際項目

相關資源:


最後更新: 2025-01-18