-
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_sft_alignment_data.py
More file actions
183 lines (149 loc) · 7.33 KB
/
generate_sft_alignment_data.py
File metadata and controls
183 lines (149 loc) · 7.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import pandas as pd
from src.util import add_jsonl_line
def produce_ny_dfs_summary(row) -> list[dict]:
    """Produce prompt/response examples from one row of NY DFS appeal data.

    For every column whose label contains "Summary" and whose value is not
    missing, up to two examples are emitted:

    1. a summary example: the prompt is built from the case metadata and the
       response is the summary text itself;
    2. a verdict example: only when the summary text mentions an
       overturn/uphold keyword.

    Args:
        row: A record exposing ``.index`` and item access (the script passes
            a ``pandas.Series``) with the columns "Diagnosis", "Treatment",
            "Denial Reason", "Gender" and "Age Range".

    Returns:
        A list of ``{"prompt": ..., "response": ...}`` dicts. The list is
        empty when the row has no non-missing summary column.

    Raises:
        KeyError: If one of the metadata columns is absent from ``row``.
    """
    diagnosis = row["Diagnosis"]
    treatment = row["Treatment"]
    denial_reason = row["Denial Reason"]
    gender = row["Gender"]
    age_range = row["Age Range"]

    examples = []
    for col in row.index:
        # str() keeps non-string column labels from raising on `in`.
        if "Summary" not in str(col):
            continue
        response = row[col]
        if pd.isna(response):
            continue

        # Example 1: generate the review summary from the case metadata.
        examples.append(
            {
                "prompt": (
                    f"Provide a summary of an insurance case adjudication outcome for an "
                    f"external appeal of an insurance plan's determination to deny "
                    f"services for {treatment} to treat {diagnosis} for a {gender} patient "
                    f"in the age range {age_range}. The denial was made on the basis of "
                    f"{denial_reason}"
                ),
                "response": f"{response}",
            }
        )

        # Example 2: extract the verdict from the summary text. Matching is
        # case-insensitive so that "Overturned ..." / "Upheld ..." at the
        # start of a sentence is recognised as well.
        lowered = str(response).lower()
        recommended_action = None
        if "overturn" in lowered:
            recommended_action = "overturn"
        # NOTE: when both keywords occur, "uphold" takes precedence.
        if "uphold" in lowered or "upheld" in lowered:
            recommended_action = "uphold"

        if recommended_action:
            examples.append(
                {
                    "prompt": f"This is a health insurance case adjudication review from an independent reviewer:\n {response}\n What was the verdict of this reviewer?",
                    "response": f"The reviewer determined that the insurance plan should {recommended_action} its original denial.",
                }
            )
    return examples
def produce_cms_qic_partc_summary(row) -> "dict | None":
    """Produce one prompt/response pair from a CMS QIC appeal row (item/service).

    Args:
        row: A record supporting item access and ``.get`` (the script passes
            a ``pandas.Series``) with the columns "decision_rationale",
            "_condition" and "part", and optionally "item_service".

    Returns:
        A ``{"prompt": ..., "response": ...}`` dict, or ``None`` when the
        item/service is absent, missing or empty, or the condition is
        missing.

    Raises:
        KeyError: If "decision_rationale", "_condition" or "part" is absent.
    """
    decision_rationale = row["decision_rationale"]
    condition = row["_condition"]
    item_service = row.get("item_service", None)
    part = row["part"]

    # Test for missing values before truthiness: NaN is truthy, and
    # bool(pd.NA) raises TypeError.
    if pd.isna(item_service) or pd.isna(condition) or not item_service:
        return None

    return {
        "prompt": (
            f"A Medicare part {part} beneficiary has been denied coverage by their plan "
            f"for {item_service} services as part of their treatment for {condition}. "
            f"A first level appeal was submitted, and the plan upheld the original denial. "
            f"Provide a summary detailing your decision as to whether or not the Plan must "
            f"cover these services under Medicare part {part} coverage rules."
        ),
        "response": f"{decision_rationale}",
    }
def produce_cms_qic_partd_summary(row) -> "dict | None":
    """Produce one prompt/response pair from a CMS QIC appeal row (drug).

    Args:
        row: A record supporting item access and ``.get`` (the script passes
            a ``pandas.Series``) with the columns "decision_rationale",
            "_condition" and "part", and optionally "drug".

    Returns:
        A ``{"prompt": ..., "response": ...}`` dict, or ``None`` when the
        drug is absent, missing or empty, or the condition is missing.

    Raises:
        KeyError: If "decision_rationale", "_condition" or "part" is absent.
    """
    decision_rationale = row["decision_rationale"]
    condition = row["_condition"]
    drug = row.get("drug", None)
    part = row["part"]

    # Test for missing values before truthiness: NaN is truthy, and
    # bool(pd.NA) raises TypeError.
    if pd.isna(drug) or pd.isna(condition) or not drug:
        return None

    return {
        "prompt": (
            f"A Medicare part {part} beneficiary has been denied coverage by their plan "
            f"for the drug {drug} as part of their treatment for {condition}. "
            f"A first level appeal was submitted, and the plan upheld the original denial. "
            f"Provide a summary detailing your decision as to whether or not the Plan must "
            f"cover these services under Medicare part {part} coverage rules."
        ),
        "response": f"{decision_rationale}",
    }
def produce_ca_dmhc_summary(row) -> list[dict]:
    """Produce prompt/response examples from one row of CA DMHC review data.

    Up to two examples are emitted per row:

    1. a summary example: the prompt is built from the case metadata and the
       response is the "Findings" text;
    2. a verdict example: only when "Determination" is exactly
       "Overturned Decision of Health Plan" or
       "Upheld Decision of Health Plan".

    Args:
        row: A record supporting item access (the script passes a
            ``pandas.Series``) with the columns "DiagnosisCategory",
            "DiagnosisSubCategory", "TreatmentCategory",
            "TreatmentSubCategory", "Type", "PatientGender", "AgeRange",
            "Findings" and "Determination".

    Returns:
        A list of ``{"prompt": ..., "response": ...}`` dicts. The list is
        empty when "Findings" is missing.

    Raises:
        KeyError: If any of the columns above is absent from ``row``.
    """
    diagnosis = row["DiagnosisCategory"]
    subdiagnosis = row["DiagnosisSubCategory"]
    treatment = row["TreatmentCategory"]
    subtreatment = row["TreatmentSubCategory"]
    denial_reason = row["Type"]
    gender = row["PatientGender"]
    age_range = row["AgeRange"]
    findings = row["Findings"]
    determination = row["Determination"]

    # Both examples are built around the findings text. A missing value
    # would be formatted as the literal string "nan", so skip such rows.
    if pd.isna(findings):
        return []

    # Example 1: generate the review findings from the case metadata.
    examples = [
        {
            "prompt": (
                f"Provide a summary of an insurance case adjudication outcome for an "
                f"external appeal of an insurance plan's determination to deny "
                f"services for {treatment} ({subtreatment}) to treat {diagnosis} "
                f"({subdiagnosis}) for a {gender} patient in the age range {age_range}. "
                f"The denial was made on the basis of {denial_reason}"
            ),
            "response": f"{findings}",
        }
    ]

    # Example 2: state the verdict, only for the two recognised determinations.
    recommended_action = None
    if determination == "Overturned Decision of Health Plan":
        recommended_action = "overturn"
    elif determination == "Upheld Decision of Health Plan":
        recommended_action = "uphold"

    if recommended_action:
        examples.append(
            {
                "prompt": f"This is a health insurance case adjudication review from an independent reviewer:\n {findings}\n What was the verdict of this reviewer?",
                "response": f"The reviewer determined that the insurance plan should {recommended_action} its original denial.",
            }
        )
    return examples
if __name__ == "__main__":
    # Each section reads one raw dataset, converts every row into
    # prompt/response examples and hands them one at a time to
    # add_jsonl_line (imported from src.util; presumably it appends one
    # JSON line to the given file -- confirm against that helper).

    ########
    # NY DFS
    ########
    ny_df = pd.read_excel("./data/raw/ny_dfs/nydfs.xlsx")
    # Each row yields a list of examples.
    per_row = ny_df.apply(produce_ny_dfs_summary, axis=1)
    outfile = "./data/alignment/ny_dfs.jsonl"
    for row_examples in per_row:
        for ex in row_examples:
            add_jsonl_line(outfile, ex)

    ##############
    # Medicare QIC
    ##############
    # Part C and part D share the same pipeline; only the input file and the
    # row-to-example function differ.
    for part, producer in (
        ("part_c", produce_cms_qic_partc_summary),
        ("part_d", produce_cms_qic_partd_summary),
    ):
        cms_df = pd.read_csv(f"./data/raw/medicare_qic/{part}.csv")
        # Only consider rows with non-empty decision_rationales
        cms_df = cms_df[cms_df["decision_rationale"].notna()]
        outfile = f"./data/alignment/medicare_{part}_appeals.jsonl"
        # Each row yields a single example or None; falsy results are skipped.
        for ex in cms_df.apply(producer, axis=1):
            if ex:
                add_jsonl_line(outfile, ex)

    ##############
    # CA DMHC
    ##############
    df = pd.read_csv(
        "./data/raw/ca_dmhc/independent-medical-review-imr-determinations-trend.csv",
        encoding="ISO-8859-1",
    )
    per_row = df.apply(produce_ca_dmhc_summary, axis=1)
    outfile = "./data/alignment/ca_dmhc.jsonl"
    for row_examples in per_row:
        for ex in row_examples:
            add_jsonl_line(outfile, ex)