-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcleanacsdata(1).py
More file actions
110 lines (85 loc) · 4.7 KB
/
cleanacsdata(1).py
File metadata and controls
110 lines (85 loc) · 4.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# === Merge RUCC with FIPS and prepare for NaNDA variables ===
def merge_rucc_with_fips_and_prepare_nanda(rucc_path, output_path):
    """Merge a cleaned RUCC dataset with county FIPS codes, add NaNDA
    placeholder columns, and (when the file is available) a USDA
    food-desert flag.

    Parameters
    ----------
    rucc_path : str
        Path to the cleaned RUCC CSV; must contain 'County_Name' and 'State'.
    output_path : str
        Destination CSV for the merged result.

    Side effects
    ------------
    Downloads the FIPS lookup table from GitHub and writes ``output_path``
    (twice when the USDA file is present: once after the FIPS merge and
    once after the flag merge, mirroring the original flow).
    """
    # Load cleaned RUCC dataset
    rucc_df = pd.read_csv(rucc_path)
    # Build a "County, State" key used to join against the FIPS lookup
    rucc_df['County_State'] = rucc_df['County_Name'].str.strip() + ', ' + rucc_df['State'].str.strip()

    # Public county-level FIPS lookup table
    fips_url = "https://raw.githubusercontent.com/kjhealy/fips-codes/master/data/county_fips_master.csv"
    fips_df = pd.read_csv(fips_url)
    fips_df['County_State'] = fips_df['name'] + ', ' + fips_df['state']

    # Left-merge so RUCC rows without a FIPS match are kept (FIPS -> NaN)
    merged_df = rucc_df.merge(fips_df[['County_State', 'fips']], on='County_State', how='left')
    merged_df.rename(columns={'fips': 'FIPS'}, inplace=True)

    # Placeholder columns to be populated later from NaNDA data
    nanda_placeholders = ['Num_Grocery_Stores', 'Per_Capita_Grocery_SqFt', 'Nutrition_Score']
    for col in nanda_placeholders:
        merged_df[col] = pd.NA

    # Save the base merge before attempting the optional USDA flag
    merged_df.to_csv(output_path, index=False)
    print(f"✅ Merged RUCC + FIPS file saved to {output_path} with NaNDA placeholders")

    # === Add USDA_Desert Flag ===
    # NOTE(review): hard-coded absolute path — consider promoting to a parameter.
    desert_path = "/Users/phinnmarkson/PycharmProjects/acs/random_county_sample_edit.csv"
    try:
        desert_df = pd.read_csv(desert_path, dtype=str)
    except FileNotFoundError:
        # Missing optional input should not abort the whole merge
        print(f"⚠️ USDA desert file not found at {desert_path}; skipping flag")
        return
    if 'County' in desert_df.columns and 'USDA_Desert' in desert_df.columns:
        # Strip whitespace so county names match exactly across files
        desert_df['County'] = desert_df['County'].str.strip()
        merged_df['County_Name'] = merged_df['County_Name'].str.strip()
        # Merge the USDA_Desert flag
        merged_df = merged_df.merge(
            desert_df[['County', 'USDA_Desert']],
            left_on='County_Name', right_on='County', how='left'
        )
        merged_df.drop(columns='County', inplace=True)
        print("✅ USDA_Desert flag added.")
    else:
        print("⚠️ USDA_Desert column not found in random_county_sample_edit.csv")
    # Re-save so the output includes the USDA_Desert flag when it was merged
    merged_df.to_csv(output_path, index=False)
"""
ACS Data Cleaning Script
Cleans the circus-like ACS data and creates random samples by income tertile
"""
import pandas as pd
import numpy as np
import os
def clean_acs_data(filepath):
    """Reshape a raw (wide, transposed) ACS export into a tidy frame.

    Returns a DataFrame with columns ['County', 'Income', 'MOE'], where
    Income is numeric and the header-label / 'United States' summary rows
    have been removed.
    """
    # The export stores counties as columns; transpose so that each
    # "County!!Metric" header becomes one row.
    raw = pd.read_csv(filepath)
    long_form = raw.T.reset_index()
    long_form.columns = ['Full_Label', 'Value']

    # 'Alpha County, ST!!Estimate' -> ('Alpha County, ST', 'Estimate')
    long_form[['County', 'Metric']] = long_form['Full_Label'].str.split('!!', expand=True)
    wide = long_form.pivot(index='County', columns='Metric', values='Value').reset_index()

    # Keep the two metrics of interest under short names
    wide = wide[['County', 'Estimate', 'Margin of Error']]
    wide.columns = ['County', 'Income', 'MOE']

    # Drop the header-label row and the national total
    summary_rows = wide['County'].str.contains('Label|United States')
    wide = wide[~summary_rows]

    # '50,000' / '±50,000' -> 50000; anything non-numeric is dropped
    wide['Income'] = wide['Income'].str.replace(',', '').str.replace('±', '')
    wide['Income'] = pd.to_numeric(wide['Income'], errors='coerce')
    return wide.dropna(subset=['Income'])
def create_random_sample(clean_df, total_samples=100, random_seed=42):
    """Draw a stratified random sample of counties across income tertiles.

    Fix: the original ignored ``total_samples`` entirely (stratum sizes
    were hard-coded to 33/33/34). The target is now split evenly across
    the three tertiles with the remainder assigned to 'High', so the
    default of 100 still yields 33/33/34. A stratum with fewer rows than
    its target contributes all of its rows.

    Parameters
    ----------
    clean_df : DataFrame
        Must have a numeric 'Income' column. Modified in place: gains a
        categorical 'Tertile' column (Low/Medium/High via qcut).
    total_samples : int
        Overall sample-size target (default 100).
    random_seed : int
        Seed for reproducible sampling.

    Returns
    -------
    DataFrame of the concatenated Low, Medium, High samples (that order).
    """
    # Label each county with its income tertile
    clean_df['Tertile'] = pd.qcut(clean_df['Income'], q=3, labels=['Low', 'Medium', 'High'])
    np.random.seed(random_seed)

    # Allocate total_samples across strata; remainder goes to 'High'
    base = total_samples // 3
    targets = {'Low': base, 'Medium': base, 'High': total_samples - 2 * base}

    samples = []
    for tertile, target in targets.items():
        stratum = clean_df[clean_df['Tertile'] == tertile]
        n = min(target, len(stratum))  # cannot sample more rows than exist
        samples.append(stratum.sample(n=n, random_state=random_seed))
    return pd.concat(samples)
# Example usage
if __name__ == "__main__":
    # Directory holding the downloaded ACS data
    dir_path = '/Users/phinnmarkson/Desktop/acs-data'

    # === Dynamically find the first ACS CSV in the folder ===
    acs_file = None
    for fname in os.listdir(dir_path):
        if fname.lower().endswith(".csv") and "acs" in fname.lower():
            acs_file = os.path.join(dir_path, fname)
            print(f"📂 Found ACS file: {acs_file}")
            break  # first match wins

    if acs_file is None:
        # Fix: the original fell through silently when nothing matched
        print(f"⚠️ No ACS CSV found in {dir_path}; skipping merge.")
    else:
        # Run the full RUCC + FIPS + USDA_Desert merge (as before, only
        # when an ACS file is present in the data directory)
        rucc_path = "/Users/phinnmarkson/PycharmProjects/acs/CopyofRUCC_StrataReady.csv"
        output_path = os.path.join(dir_path, "merged_rucc_fips_usda.csv")
        merge_rucc_with_fips_and_prepare_nanda(rucc_path, output_path)
        print(f"📁 Full RUCC + FIPS + USDA merge completed. Output saved to {output_path}")