-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcleanacsdata(1).py
More file actions
110 lines (85 loc) · 4.7 KB
/
cleanacsdata(1).py
File metadata and controls
110 lines (85 loc) · 4.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# === Merge RUCC with FIPS and prepare for NaNDA variables ===
def merge_rucc_with_fips_and_prepare_nanda(rucc_path, output_path):
    """Merge a cleaned RUCC dataset with county FIPS codes, add NaNDA
    placeholder columns, and (when the file is available) a USDA
    food-desert flag.

    Parameters
    ----------
    rucc_path : str
        Path to the cleaned RUCC CSV; must contain 'County_Name' and 'State'.
    output_path : str
        Destination CSV for the merged result.

    Side effects
    ------------
    Downloads the FIPS lookup table from GitHub and writes ``output_path``
    (twice when the USDA file is present: once after the FIPS merge and
    once after the flag merge, mirroring the original flow).
    """
    # Load cleaned RUCC dataset
    rucc_df = pd.read_csv(rucc_path)
    # Build a "County, State" key used to join against the FIPS lookup
    rucc_df['County_State'] = rucc_df['County_Name'].str.strip() + ', ' + rucc_df['State'].str.strip()

    # Public county-level FIPS lookup table
    fips_url = "https://raw.githubusercontent.com/kjhealy/fips-codes/master/data/county_fips_master.csv"
    fips_df = pd.read_csv(fips_url)
    fips_df['County_State'] = fips_df['name'] + ', ' + fips_df['state']

    # Left-merge so RUCC rows without a FIPS match are kept (FIPS -> NaN)
    merged_df = rucc_df.merge(fips_df[['County_State', 'fips']], on='County_State', how='left')
    merged_df.rename(columns={'fips': 'FIPS'}, inplace=True)

    # Placeholder columns to be populated later from NaNDA data
    nanda_placeholders = ['Num_Grocery_Stores', 'Per_Capita_Grocery_SqFt', 'Nutrition_Score']
    for col in nanda_placeholders:
        merged_df[col] = pd.NA

    # Save the base merge before attempting the optional USDA flag
    merged_df.to_csv(output_path, index=False)
    print(f"✅ Merged RUCC + FIPS file saved to {output_path} with NaNDA placeholders")

    # === Add USDA_Desert Flag ===
    # NOTE(review): hard-coded absolute path — consider promoting to a parameter.
    desert_path = "/Users/phinnmarkson/PycharmProjects/acs/random_county_sample_edit.csv"
    try:
        desert_df = pd.read_csv(desert_path, dtype=str)
    except FileNotFoundError:
        # Missing optional input should not abort the whole merge
        print(f"⚠️ USDA desert file not found at {desert_path}; skipping flag")
        return
    if 'County' in desert_df.columns and 'USDA_Desert' in desert_df.columns:
        # Strip whitespace so county names match exactly across files
        desert_df['County'] = desert_df['County'].str.strip()
        merged_df['County_Name'] = merged_df['County_Name'].str.strip()
        # Merge the USDA_Desert flag
        merged_df = merged_df.merge(
            desert_df[['County', 'USDA_Desert']],
            left_on='County_Name', right_on='County', how='left'
        )
        merged_df.drop(columns='County', inplace=True)
        print("✅ USDA_Desert flag added.")
    else:
        print("⚠️ USDA_Desert column not found in random_county_sample_edit.csv")
    # Re-save so the output includes the USDA_Desert flag when it was merged
    merged_df.to_csv(output_path, index=False)
"""
ACS Data Cleaning Script
Cleans the circus-like ACS data and creates random samples by income tertile
"""
import pandas as pd
import numpy as np
import os
def clean_acs_data(filepath):
    """Reshape a raw (wide, transposed) ACS export into a tidy frame.

    Returns a DataFrame with columns ['County', 'Income', 'MOE'], where
    Income is numeric and the header-label / 'United States' summary rows
    have been removed.
    """
    # The export stores counties as columns; transpose so that each
    # "County!!Metric" header becomes one row.
    raw = pd.read_csv(filepath)
    long_form = raw.T.reset_index()
    long_form.columns = ['Full_Label', 'Value']

    # 'Alpha County, ST!!Estimate' -> ('Alpha County, ST', 'Estimate')
    long_form[['County', 'Metric']] = long_form['Full_Label'].str.split('!!', expand=True)
    wide = long_form.pivot(index='County', columns='Metric', values='Value').reset_index()

    # Keep the two metrics of interest under short names
    wide = wide[['County', 'Estimate', 'Margin of Error']]
    wide.columns = ['County', 'Income', 'MOE']

    # Drop the header-label row and the national total
    summary_rows = wide['County'].str.contains('Label|United States')
    wide = wide[~summary_rows]

    # '50,000' / '±50,000' -> 50000; anything non-numeric is dropped
    wide['Income'] = wide['Income'].str.replace(',', '').str.replace('±', '')
    wide['Income'] = pd.to_numeric(wide['Income'], errors='coerce')
    return wide.dropna(subset=['Income'])
def create_random_sample(clean_df, total_samples=100, random_seed=42):
    """Draw a stratified random sample of counties across income tertiles.

    Fix: the original ignored ``total_samples`` entirely (stratum sizes
    were hard-coded to 33/33/34). The target is now split evenly across
    the three tertiles with the remainder assigned to 'High', so the
    default of 100 still yields 33/33/34. A stratum with fewer rows than
    its target contributes all of its rows.

    Parameters
    ----------
    clean_df : DataFrame
        Must have a numeric 'Income' column. Modified in place: gains a
        categorical 'Tertile' column (Low/Medium/High via qcut).
    total_samples : int
        Overall sample-size target (default 100).
    random_seed : int
        Seed for reproducible sampling.

    Returns
    -------
    DataFrame of the concatenated Low, Medium, High samples (that order).
    """
    # Label each county with its income tertile
    clean_df['Tertile'] = pd.qcut(clean_df['Income'], q=3, labels=['Low', 'Medium', 'High'])
    np.random.seed(random_seed)

    # Allocate total_samples across strata; remainder goes to 'High'
    base = total_samples // 3
    targets = {'Low': base, 'Medium': base, 'High': total_samples - 2 * base}

    samples = []
    for tertile, target in targets.items():
        stratum = clean_df[clean_df['Tertile'] == tertile]
        n = min(target, len(stratum))  # cannot sample more rows than exist
        samples.append(stratum.sample(n=n, random_state=random_seed))
    return pd.concat(samples)
# Example usage
if __name__ == "__main__":
    # Directory holding the downloaded ACS data
    dir_path = '/Users/phinnmarkson/Desktop/acs-data'

    # === Dynamically find the first ACS CSV in the folder ===
    acs_file = None
    for fname in os.listdir(dir_path):
        if fname.lower().endswith(".csv") and "acs" in fname.lower():
            acs_file = os.path.join(dir_path, fname)
            print(f"📂 Found ACS file: {acs_file}")
            break  # first match wins

    if acs_file is None:
        # Fix: the original fell through silently when nothing matched
        print(f"⚠️ No ACS CSV found in {dir_path}; skipping merge.")
    else:
        # Run the full RUCC + FIPS + USDA_Desert merge (as before, only
        # when an ACS file is present in the data directory)
        rucc_path = "/Users/phinnmarkson/PycharmProjects/acs/CopyofRUCC_StrataReady.csv"
        output_path = os.path.join(dir_path, "merged_rucc_fips_usda.csv")
        merge_rucc_with_fips_and_prepare_nanda(rucc_path, output_path)
        print(f"📁 Full RUCC + FIPS + USDA merge completed. Output saved to {output_path}")