-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpair_selection.py
More file actions
93 lines (62 loc) · 2.56 KB
/
pair_selection.py
File metadata and controls
93 lines (62 loc) · 2.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint
# List the data folder once. os.listdir() returns entries in arbitrary order,
# so sort them to keep the ticker order reproducible across runs and platforms.
# (The name avoids shadowing the builtin dir().)
data_files = sorted(os.listdir("data"))
# Create a list of stock names, exclude output files.
# splitext() strips only the trailing ".csv", so the name can be turned back
# into the exact file path later even if ".csv" also appears elsewhere in it.
stock_names = [
    os.path.splitext(filename)[0]
    for filename in data_files
    if filename.endswith(".csv")
    and "final_pairs" not in filename
    and "combined_df" not in filename
]
# Create a dictionary to store stock data, and reference it by stock name.
# Each value is a float Series of closing prices indexed by date and named
# after its ticker.
stock_data = {}
for ticker in stock_names:
    try:
        raw = pd.read_csv(f"data/{ticker}.csv")
        prices = (
            # Rows 0 and 1 are treated as leftover header rows, not price
            # data, so they are dropped by position.
            raw.iloc[2:][["Price", "Close"]]
            .dropna()
            # The "Price" column is the one holding the dates. assign()
            # returns a new frame, so nothing is written into a slice of
            # `raw` (no chained-assignment / SettingWithCopy hazard).
            .assign(Price=lambda frame: pd.to_datetime(frame["Price"]))
            .set_index("Price")
            .sort_index()
        )
        stock_data[ticker] = prices["Close"].astype(float).rename(ticker)
    except Exception as e:
        # Best effort: a file that cannot be read or parsed is reported and
        # skipped so the remaining tickers still load.
        print(f"Error loading {ticker}: {e}")
# LODHA.NS has data from 2021, so we remove it from stock_data
# (presumably so its short history does not limit the date range shared by
# all tickers -- confirm with the data owner).
# Each container is cleaned up independently: the ticker may be listed in
# stock_names without having been loaded into stock_data, and neither
# removal should raise when the ticker is absent.
stock_data.pop("LODHA.NS", None)
if "LODHA.NS" in stock_names:
    stock_names.remove("LODHA.NS")
# Put every loaded series side by side in one DataFrame; the inner join keeps
# only the index values that are present in all of them.
merged = pd.concat(list(stock_data.values()), axis=1, join="inner")
# Label the columns with the tickers, in the dictionary's insertion order.
merged.columns = list(stock_data)
# Order the rows by their index before persisting.
combined_df = merged.sort_index()
# Write the aligned table next to the input files.
combined_df.to_csv("data/combined_df.csv")
# Collect (stock A, stock B, p-value) for every unordered pair of columns.
results = []
tickers = combined_df.columns.tolist()
for pos, stock_a in enumerate(tickers):
    series_a = combined_df[stock_a]
    # Pair each ticker only with the ones after it, so every pair is tested
    # once and the earlier column is always "A".
    for stock_b in tickers[pos + 1:]:
        series_b = combined_df[stock_b]
        # Run the Engle-Granger cointegration test; coint() returns a
        # 3-tuple whose second element is the p-value.
        try:
            p_value = coint(series_a, series_b)[1]
            results.append((stock_a, stock_b, p_value))
        except Exception as e:
            # A pair that cannot be tested is reported and skipped.
            print(f"Skipped {stock_a} and {stock_b}: {e}")
# Tabulate every tested pair together with its p-value.
pair_columns = ["Stock A", "Stock B", "p-value"]
pval_df = pd.DataFrame(data=results, columns=pair_columns)
# Keep only the pairs whose p-value is below the 0.05 threshold,
# ordered from the smallest p-value upwards.
is_significant = pval_df["p-value"].lt(0.05)
final_df = pval_df.loc[is_significant].sort_values("p-value")
# Save the selected pairs to a CSV file (without the row index).
final_df.to_csv("data/final_pairs.csv", index=False)