forked from made-mlops-2022/Made-ML12-Masaeva-Olga
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_pipeline.py
More file actions
75 lines (58 loc) · 2.58 KB
/
train_pipeline.py
File metadata and controls
75 lines (58 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json
import logging
import os
import sys
from pathlib import Path
import click
import pandas as pd
from data.make_dataset import read_data, split_train_val_data, save_data
from enities.train_pipeline_params import (
TrainingPipelineParams,
read_training_pipeline_params,
)
from features.build_features import extract_target, drop_features
from model_gen.model_fit import(
train_model,
serialize_model,
evaluate_model,
)
from custom_logs.log_decorator import log
from model_gen.model_predict import predict_model
# Module-level logger for the training pipeline.
logger = logging.getLogger(__name__)
# Emit INFO and above from this module's logger.
logger.setLevel(logging.INFO)

# Stream handler bound to stdout; kept as a module-level name so other code
# can still reference it.
handler = logging.StreamHandler(sys.stdout)
logger.addHandler(handler)
@log
def train_pipeline(config_path: str):
    """Load the pipeline configuration and run the training pipeline.

    Args:
        config_path: Location of the configuration, passed unchanged to
            ``read_training_pipeline_params``.

    Returns:
        Whatever ``run_train_pipeline`` returns for the loaded parameters.
    """
    return run_train_pipeline(read_training_pipeline_params(config_path))
@log
def run_train_pipeline(training_pipeline_params):
    """Run the training flow: load, split, fit, score and persist.

    Steps, in order: read the input dataset, split it into train and
    validation parts, extract the target from each part, drop the target
    column and apply ``drop_features`` to both parts, save the validation
    features, fit the model on the train part, score its predictions on the
    validation part, write the metrics as JSON and serialize the model.

    Args:
        training_pipeline_params: Pipeline configuration object. This
            function reads ``input_data_path``, ``splitting_params``,
            ``feature_params`` (including its ``target_col``),
            ``validation_dataset_path``, ``train_params``, ``metric_path``
            and ``output_model_path`` from it.

    Returns:
        A ``(path_to_model, metrics)`` tuple: the value returned by
        ``serialize_model`` and the object returned by ``evaluate_model``.
    """
    cfg = training_pipeline_params
    logger.info(f"start train pipeline with params{cfg}")

    dataset = read_data(cfg.input_data_path)
    logger.info(f"data.shape is {dataset.shape}")

    train_part, val_part = split_train_val_data(dataset, cfg.splitting_params)

    # Targets are extracted before the target column is dropped below.
    val_target = extract_target(val_part, cfg.feature_params)
    train_target = extract_target(train_part, cfg.feature_params)

    train_part = train_part.drop(columns=cfg.feature_params.target_col)
    val_part = val_part.drop(columns=cfg.feature_params.target_col)
    train_features = drop_features(train_part, cfg.feature_params)
    val_features = drop_features(val_part, cfg.feature_params)

    # The validation features (target already removed) are written out
    # before the model is fitted.
    save_data(val_features, cfg.validation_dataset_path)
    logger.info(f"train_df.shape is {train_features.shape}")
    logger.info(f"val_df.shape is {val_features.shape}")

    fitted_model = train_model(train_features, train_target, cfg.train_params)
    val_predictions = predict_model(fitted_model, val_features)
    metrics = evaluate_model(val_predictions, val_target)

    # Persist whatever ``metrics.toJSON()`` yields as the JSON metrics file.
    with open(cfg.metric_path, "w+") as metric_file:
        json.dump(metrics.toJSON(), metric_file)
    logger.info(f"metrics are {metrics}")

    path_to_model = serialize_model(fitted_model, cfg.output_model_path)
    return path_to_model, metrics
# Command-line entry point, registered with click under the name
# "train_pipeline". It takes one positional argument, CONFIG_PATH, and
# forwards it to train_pipeline(); the pipeline's return value is discarded.
# NOTE: documented with comments rather than a docstring because click uses a
# command function's docstring as its --help text.
@click.command(name="train_pipeline")
@click.argument("config_path")
def train_pipeline_command(config_path: str) -> None:
    train_pipeline(config_path)
# Invoke the click command only when this module is executed as a script;
# importing the module does not start a training run.
if __name__ == "__main__":
    train_pipeline_command()