From f15b47ceeecef014c11a04e95cd91a3632207fa5 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Thu, 23 Oct 2025 22:27:40 +0200 Subject: [PATCH 1/5] Fix file inconsistencies --- abbreviations.csv | 2 +- abbreviations.smi | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/abbreviations.csv b/abbreviations.csv index db7dcfa..9328bd2 100644 --- a/abbreviations.csv +++ b/abbreviations.csv @@ -7,7 +7,7 @@ NH2,N –OCH2O–,*OCO* s-Bu,*C(C)CC i-Pr,*C(C)C -2,6-(CH3)2C6H3,*C1=C(C)C=CC=C1(C) +"2,6-(CH3)2C6H3",*C1=C(C)C=CC=C1(C) PMB,*OCC1=CC=C(C=C1)OC Bpin,CC1(C)OB([*])OC1(C)C PMP,COC1=CC=C([*])C=C1 diff --git a/abbreviations.smi b/abbreviations.smi index 10b80e5..90a9078 100644 --- a/abbreviations.smi +++ b/abbreviations.smi @@ -1,14 +1,14 @@ -Abbreviation SMILES -C6H4F *c1ccc(F)cc1 -Mes *c1c(C)cc(C)cc1(C) -S S -N N -NH2 N -–OCH2O– *OCO* -s-Bu *C(C)CC -i-Pr *C(C)C -2,6-(CH3)2C6H3 *C1=C(C)C=CC=C1(C) -PMB *OCC1=CC=C(C=C1)OC +Abbreviation SMILES +C6H4F *c1ccc(F)cc1 +Mes *c1c(C)cc(C)cc1(C) +S S +N N +NH2 N +–OCH2O– *OCO* +s-Bu *C(C)CC +i-Pr *C(C)C +2,6-(CH3)2C6H3 *C1=C(C)C=CC=C1(C) +PMB *OCC1=CC=C(C=C1)OC Bpin CC1(C)OB([*])OC1(C)C PMP COC1=CC=C([*])C=C1 OPP O=P(OP(O[*])(O)=O)(O)O @@ -268,6 +268,6 @@ Ad [*]C12CC3CC(C2)CC(C3)C1 ADMB O=C(O[*])C(C)(C)CCOC(C)=O Adoc O=C([*])OC12CC3CC(C2)CC(C3)C1 Adpoc O=C([*])OC(C)(C12CC3CC(C2)CC(C3)C1)C -Alloc O=C([*])OCC=C +Alloc O=C([*])OCC=C AOC O=C([*])OCC=C -Bns O=S(CC1=CC=CC=C1)([O*])=O +Bns O=S(CC1=CC=CC=C1)([O*])=O From 78faa59b35fd29490780e426ac93ec2f34e6e5ed Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Thu, 23 Oct 2025 22:34:03 +0200 Subject: [PATCH 2/5] Update abbreviations.csv --- abbreviations.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/abbreviations.csv b/abbreviations.csv index 9328bd2..92792d2 100644 --- a/abbreviations.csv +++ b/abbreviations.csv @@ -134,7 +134,7 @@ O-n-C8H17,CCCCCCCCO[*] OBn,[*]OCC1=CC=CC=C1 OHC,O=C[*] PO3Bn2,O=P(OCC1=CC=CC=C1)([*])OCC2=CC=CC=C2 -Py,2 [*]C1=NC=CC=C1 +"Py 2",[*]C1=NC=CC=C1 SCE,[*]SCCC#N TBSO,C[Si](C(C)(C)C)(O[*])C decanyl,CCCCCCCCCC[*] From d0785d14f94e13edc7e4959b7680a8ca6164daeb Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Thu, 23 Oct 2025 22:39:06 +0200 Subject: [PATCH 3/5] Add missing entries in CSV --- abbreviations.csv | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/abbreviations.csv b/abbreviations.csv index 92792d2..83c15a1 100644 --- a/abbreviations.csv +++ b/abbreviations.csv @@ -257,3 +257,17 @@ SEt,CCS[*] SnBu3,[*][Sn](CCCC)(CCCC)CCCC TMSE,C[Si](C)(C)CC[*] n-Hexyl,CCCCCC[*] +AAM,O=C1C2=CC=CC=C2N[*]N1 +ABn,[N-]=[N+]=NC1=CC=C(C=C1)C[*] +ABO,[*]C1(O2)OCCC2CO1 +Ac,O=C([*])C +ACBZ,O=C([*])OCC1=CC=C(N=[N+]=[N-])C=C1 +AcHmb,COC1=CC=C(C(OC(C)=O)=C1)C[*] +Acm,CC(NC[*])=O +Ad,[*]C12CC3CC(C2)CC(C3)C1 +ADMB,O=C(O[*])C(C)(C)CCOC(C)=O +Adoc,O=C([*])OC12CC3CC(C2)CC(C3)C1 +Adpoc,O=C([*])OC(C)(C12CC3CC(C2)CC(C3)C1)C +Alloc,O=C([*])OCC=C +AOC,O=C([*])OCC=C +Bns,O=S(CC1=CC=CC=C1)([O*])=O From 0d3aeafa86c762701bd33f33621c103a0f50ae58 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Thu, 23 Oct 2025 22:40:42 +0200 Subject: [PATCH 4/5] Add linting script --- .github/workflows/check_abbreviations.yml | 5 +++ lint.py | 38 +++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 lint.py diff --git a/.github/workflows/check_abbreviations.yml b/.github/workflows/check_abbreviations.yml index b97a264..d6ef172 100644 --- a/.github/workflows/check_abbreviations.yml +++ b/.github/workflows/check_abbreviations.yml @@ -15,3 +15,8 @@ jobs: - uses: actions/checkout@v3 - name: Run duplicate check run: ./scripts/check_duplicates_abbreviations.sh + + - name: Install uv + uses: astral-sh/setup-uv@v3 + - name: Check CSV/SMI sync + run: uv run lint.py diff --git a/lint.py b/lint.py new file mode 100644 index 0000000..705d400 --- /dev/null +++ b/lint.py @@ -0,0 +1,38 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "pandas", +# ] +# /// + +from pathlib import Path + +import pandas as pd + +HERE = Path(__file__).parent.resolve() + + +def main(): + tsv_path = HERE.joinpath("abbreviations.csv") + df1 = pd.read_csv(tsv_path) + + smi_path = HERE.joinpath("abbreviations.smi") + df2 = pd.read_csv(smi_path, sep="\t") + + if not (df1.columns == df2.columns).all(): + raise + + s1 = set(df1['Abbreviation']) + s2 = set(df2['Abbreviation']) + + d1 = s1 - s2 + if d1: + raise ValueError(f"abbreviations in CSV but not SMI: {d1}") + + d2 = s2 - s1 + if d2: + raise ValueError(f"abbreviations in SMI but not CSV: {d2}") + + +if __name__ == '__main__': + main() From 4c85dedc6660d7dea1243a1c97ed3e6f7c1b2db0 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 5 Nov 2025 15:40:59 +0100 Subject: [PATCH 5/5] Update lint.py --- lint.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/lint.py b/lint.py index 705d400..5641edc 100644 --- a/lint.py +++ b/lint.py @@ -13,25 +13,11 @@ def main(): - tsv_path = HERE.joinpath("abbreviations.csv") - df1 = pd.read_csv(tsv_path) - smi_path = HERE.joinpath("abbreviations.smi") df2 = pd.read_csv(smi_path, sep="\t") - if not (df1.columns == df2.columns).all(): - raise - - s1 = set(df1['Abbreviation']) - s2 = set(df2['Abbreviation']) - - d1 = s1 - s2 - if d1: - raise ValueError(f"abbreviations in CSV but not SMI: {d1}") - - d2 = s2 - s1 - if d2: - raise ValueError(f"abbreviations in SMI but not CSV: {d2}") + if smi_path.read_text() != df2.to_csv(sep='\t', index=False): + raise ValueError(f"{smi_path} is not formatted properly.") if __name__ == '__main__':