diff --git a/.gitignore b/.gitignore index b41f4b0..8325591 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +data/solvation/**/*.csv + + scripts/*.txt slurm.* .old diff --git a/data/solvation/README.md b/data/solvation/README.md new file mode 100644 index 0000000..6cd1b06 --- /dev/null +++ b/data/solvation/README.md @@ -0,0 +1,72 @@ +# Solvation Data + +Computed solvation free energies and enthalpies at 298.15 K for solute–solvent pairs, generated +by the COSMO-RS-based workflow described in the QuantumPioneer project paper. Each CSV file +corresponds to a single solvent (295 solvents total) and contains solvation properties for +every solute evaluated in that solvent. + +## Dataset Schemas + +### Closed-Shell and Open-Shell Species + +| Column | Description | +|--------------------|-----------------------------------------------------------------| +| `smiles` | Canonical SMILES of the solute | +| `Gsolv (kcal/mol)` | Solvation free energy of the solute in this solvent at 298.15 K | +| `Hsolv (kcal/mol)` | Solvation enthalpy of the solute in this solvent at 298.15 K | + +The two directories differ only in the type of solute: `closed_shell_species` contains +non-radical solutes, while `open_shell_species` contains radicals. 
+ +### Transition States + +| Column | Description | +|------------------------------|----------------------------------------------------------------------------| +| `rxn_smiles` | Reaction SMILES (`r1.r2>>p1.p2`) | +| `Gsolv (kcal/mol)` | Solvation free energy of the transition state `ts` at 298.15 K | +| `r1_Gsolv` | Solvation free energy of reactant `r1` at 298.15 K | +| `r2_Gsolv` | Solvation free energy of reactant `r2` at 298.15 K | +| `p1_Gsolv` | Solvation free energy of product `p1` at 298.15 K | +| `p2_Gsolv` | Solvation free energy of product `p2` at 298.15 K | +| `DDGsolv_forward (kcal/mol)` | Solvation free energy of activation in the forward direction (`r1.r2>>ts`) | +| `DDGsolv_reverse (kcal/mol)` | Solvation free energy of activation in the reverse direction (`p1.p2>>ts`) | +| `Hsolv (kcal/mol)` | Solvation enthalpy of the transition state `ts` at 298.15 K | +| `r1_Hsolv` | Solvation enthalpy of reactant `r1` at 298.15 K | +| `r2_Hsolv` | Solvation enthalpy of reactant `r2` at 298.15 K | +| `p1_Hsolv` | Solvation enthalpy of product `p1` at 298.15 K | +| `p2_Hsolv` | Solvation enthalpy of product `p2` at 298.15 K | +| `DDHsolv_forward (kcal/mol)` | Solvation enthalpy of activation in the forward direction (`r1.r2>>ts`) | +| `DDHsolv_reverse (kcal/mol)` | Solvation enthalpy of activation in the reverse direction (`p1.p2>>ts`) | + +All energies are in kcal/mol. + +## Directory Structure + +``` +data/solvation/ +├── closed_shell_species/ +│ ├── a/ +│ │ ├── acetaldehyde.csv +│ │ ├── aceticacid.csv +│ │ └── ... +│ ├── b/ +│ └── ... +├── open_shell_species/ +│ ├── a/ +│ │ ├── acetaldehyde.csv +│ │ └── ... +│ ├── b/ +│ └── ... +├── transition_states/ +│ ├── a/ +│ │ ├── acetaldehyde.csv +│ │ └── ... +│ ├── b/ +│ └── ... +└── README.md +``` + +Within each top-level category the files are organized into subdirectories named after +the first alphabetical character of the solvent name (e.g. `a/`, `b/`, …). 
Each CSV +file is named `<solvent>.csv`, where `<solvent>` is the COSMO-RS solvent +identifier. diff --git a/scripts/solvation/solvents.csv b/scripts/solvation/solvents.csv new file mode 100644 index 0000000..9f9f6c8 --- /dev/null +++ b/scripts/solvation/solvents.csv @@ -0,0 +1,296 @@ +cosmo_name,smiles,inchi,cosmo_conf,source,exp_dielectric,T,source.1,polar,protic +"(1,1-dimethylethyl)benzene",CC(C)(C)c1ccccc1,"InChI=1/C10H14/c1-10(2,3)9-7-5-4-6-8-9/h4-8H,1-3H3",1,COSMObase,2.359,20.0,,0.0,0 +"1-(1,1-dimethylethoxy)-2-propanol",CC(O)COC(C)(C)C,"InChI=1/C7H16O2/c1-6(8)5-9-7(2,3)4/h6,8H,5H2,1-4H3",5,COSMObase,,,,,1 +1-bromonaphthalene,Brc1cccc2ccccc12,InChI=1/C10H7Br/c11-10-7-3-5-8-4-1-2-6-9(8)10/h1-7H,1,COSMObase,4.768,25.0,,0.0,0 +1-bromooctane,CCCCCCCCBr,"InChI=1/C8H17Br/c1-2-3-4-5-6-7-8-9/h2-8H2,1H3",4,COSMObase,5.0957,20.0,,0.0,0 +1-bromopropane,CCCBr,"InChI=1/C3H7Br/c1-2-3-4/h2-3H2,1H3",2,COSMObase,8.09,20.0,,0.0,0 +1-butanol,CCCCO,"InChI=1/C4H10O/c1-2-3-4-5/h5H,2-4H2,1H3",7,COSMOtherm,17.84,20.0,,0.0,1 +1-butylamine,CCCCN,"InChI=1/C4H11N/c1-2-3-4-5/h2-5H2,1H3",3,COSMOtherm,4.71,20.0,,0.0,1 +1-chloro-2-methyl-propane,CC(C)CCl,"InChI=1/C4H9Cl/c1-4(2)3-5/h4H,3H2,1-2H3",1,COSMObase,7.027,20.0,,0.0,0 +1-chlorobutane,CCCCCl,"InChI=1/C4H9Cl/c1-2-3-4-5/h2-4H2,1H3",1,COSMOtherm,7.276,20.0,,0.0,0 +1-chlorohexane,CCCCCCCl,"InChI=1/C6H13Cl/c1-2-3-4-5-6-7/h2-6H2,1H3",2,COSMOtherm,6.104,20.0,,0.0,0 +1-chloronaphthalene,Clc1cccc2ccccc12,InChI=1/C10H7Cl/c11-10-7-3-5-8-4-1-2-6-9(8)10/h1-7H,1,COSMObase,5.04,25.0,,0.0,0 +1-chloropropane,CCCCl,"InChI=1/C3H7Cl/c1-2-3-4/h2-3H2,1H3",1,COSMOtherm,8.588,20.0,,0.0,0 +1-decanol,CCCCCCCCCCO,"InChI=1/C10H22O/c1-2-3-4-5-6-7-8-9-10-11/h11H,2-10H2,1H3",3,COSMObase,7.93,20.0,,0.0,1 +1-decene,C=CCCCCCCCC,"InChI=1/C10H20/c1-3-5-7-9-10-8-6-4-2/h3H,1,4-10H2,2H3",4,COSMObase,2.136,20.0,,0.0,0 +1-ethoxybutane,CCCCOCC,"InChI=1/C6H14O/c1-3-5-6-7-4-2/h3-6H2,1-2H3",2,COSMObase,,,,,0 
+1-fluorooctane,CCCCCCCCF,"InChI=1/C8H17F/c1-2-3-4-5-6-7-8-9/h2-8H2,1H3",4,COSMObase,3.89,20.0,,0.0,0 +1-heptanol,CCCCCCCO,"InChI=1/C7H16O/c1-2-3-4-5-6-7-8/h8H,2-7H2,1H3",8,COSMOtherm,11.75,20.0,,0.0,1 +1-hexadecene,C=CCCCCCCCCCCCCCC,"InChI=1/C16H32/c1-3-5-7-9-11-13-15-16-14-12-10-8-6-4-2/h3H,1,4-16H2,2H3",3,COSMObase,,,,,0 +1-hexanol,CCCCCCO,"InChI=1/C6H14O/c1-2-3-4-5-6-7/h7H,2-6H2,1H3",10,COSMOtherm,13.03,20.0,,0.0,1 +1-hexene,C=CCCCC,"InChI=1/C6H12/c1-3-5-6-4-2/h3H,1,4-6H2,2H3",1,COSMOtherm,2.077,21.0,,0.0,0 +1-methyl-4-isopropylbenzene,Cc1ccc(C(C)C)cc1,"InChI=1/C10H14/c1-8(2)10-6-4-9(3)5-7-10/h4-8H,1-3H3",1,COSMObase,,,,,0 +1-methyl-pyrrolidine,CN1CCCC1,"InChI=1/C5H11N/c1-6-4-2-3-5-6/h2-5H2,1H3",1,COSMObase,32.2,25.0,,1.0,0 +1-methylnaphthalene,Cc1cccc2ccccc12,"InChI=1/C11H10/c1-9-5-4-7-10-6-2-3-8-11(9)10/h2-8H,1H3",1,COSMObase,2.915,20.0,,0.0,0 +1-methylpiperidin-2-one,CN1CCCCC1=O,"InChI=1/C6H11NO/c1-7-5-3-2-4-6(7)8/h2-5H2,1H3",1,COSMObase,,,,,0 +1-nitropropane,CCC[N+](=O)[O-],"InChI=1/C3H7NO2/c1-2-3-4(5)6/h2-3H2,1H3",3,COSMObase,24.7,15.0,,1.0,0 +1-nonanol,CCCCCCCCCO,"InChI=1/C9H20O/c1-2-3-4-5-6-7-8-9-10/h10H,2-9H2,1H3",7,COSMObase,8.83,20.0,,0.0,1 +1-nonene,C=CCCCCCCC,"InChI=1/C9H18/c1-3-5-7-9-8-6-4-2/h3H,1,4-9H2,2H3",3,COSMObase,2.18,20.0,,0.0,0 +1-octanol,CCCCCCCCO,"InChI=1/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3",7,COSMOtherm,10.3,20.0,,0.0,1 +1-pentanol,CCCCCO,"InChI=1/C5H12O/c1-2-3-4-5-6/h6H,2-5H2,1H3",8,COSMOtherm,15.13,25.0,,0.0,1 +1-propoxy-2-propanol,CCCOCC(C)O,"InChI=1/C6H14O2/c1-3-4-8-5-6(2)7/h6-7H,3-5H2,1-2H3",14,COSMObase,,,,,1 +1-undecanol,CCCCCCCCCCCO,"InChI=1/C11H24O/c1-2-3-4-5-6-7-8-9-10-11-12/h12H,2-11H2,1H3",4,COSMObase,5.98,40.0,,0.0,1 +"1,1-dibromoethane",CC(Br)Br,"InChI=1/C2H4Br2/c1-2(3)4/h2H,1H3",1,COSMObase,4.9612,20.0,,0.0,0 +"1,1-dichloroethane",CC(Cl)Cl,"InChI=1/C2H4Cl2/c1-2(3)4/h2H,1H3",1,COSMOtherm,10.1,25.0,,0.0,0 +"1,1,1-trichloroethane",CC(Cl)(Cl)Cl,"InChI=1/C2H3Cl3/c1-2(3,4)5/h1H3",1,COSMOtherm,7.243,20.0,,0.0,0 
+"1,1,2,2-tetrabromoethane",BrC(Br)C(Br)Br,InChI=1/C2H2Br4/c3-1(4)2(5)6/h1-2H,2,COSMObase,6.72,30.0,,0.0,0 +"1,1,2,2-tetrachloroethane",ClC(Cl)C(Cl)Cl,InChI=1/C2H2Cl4/c3-1(4)2(5)6/h1-2H,3,COSMObase,8.5,20.0,,0.0,0 +"1,1'-oxybis-2-ethoxy-ethane",CCOCCOCCOCC,"InChI=1/C8H18O3/c1-3-9-5-7-11-8-6-10-4-2/h3-8H2,1-2H3",16,COSMObase,,,,,0 +"1,2-benzenedicarboxylicaciddinonylester",CCCCCCCCCOC(=O)c1ccccc1C(=O)OCCCCCCCCC,"InChI=1/C26H42O4/c1-3-5-7-9-11-13-17-21-29-25(27)23-19-15-16-20-24(23)26(28)30-22-18-14-12-10-8-6-4-2/h15-16,19-20H,3-14,17-18,21-22H2,1-2H3",10,COSMObase,,,,,0 +"1,2-butanediol",CCC(O)CO,"InChI=1/C4H10O2/c1-2-4(6)3-5/h4-6H,2-3H2,1H3",13,COSMObase,22.4,25.0,,1.0,1 +"1,2-diaminoethane",NCCN,InChI=1/C2H8N2/c3-1-2-4/h1-4H2,8,COSMObase,,,,,1 +"1,2-dibromoethane",BrCCBr,InChI=1/C2H4Br2/c3-1-2-4/h1-2H2,2,COSMOtherm,4.9612,20.0,,0.0,0 +"1,2-dichlorobenzene",Clc1ccccc1Cl,InChI=1/C6H4Cl2/c7-5-3-1-2-4-6(5)8/h1-4H,1,COSMOtherm,10.12,20.0,,0.0,0 +"1,2-dichloroethane",ClCCCl,InChI=1/C2H4Cl2/c3-1-2-4/h1-2H2,2,COSMOtherm,10.42,20.0,,0.0,0 +"1,2-dimethoxyethane",COCCOC,"InChI=1/C4H10O2/c1-5-3-4-6-2/h3-4H2,1-2H3",4,COSMObase,7.3,24.0,,0.0,0 +"1,2-dimethylbenzene",Cc1ccccc1C,"InChI=1/C8H10/c1-7-5-3-4-6-8(7)2/h3-6H,1-2H3",1,COSMOtherm,2.562,20.0,,0.0,0 +"1,2-epoxypropane",CC1CO1,"InChI=1/C3H6O/c1-3-2-4-3/h3H,2H2,1H3",1,COSMObase,,,,,0 +"1,2,4-trimethylbenzene",Cc1ccc(C)c(C)c1,"InChI=1/C9H12/c1-7-4-5-8(2)9(3)6-7/h4-6H,1-3H3",1,COSMOtherm,2.377,20.0,,0.0,0 +"1,3-butanediol",CC(O)CCO,"InChI=1/C4H10O2/c1-4(6)2-3-5/h4-6H,2-3H2,1H3",19,COSMObase,28.8,25.0,,1.0,1 +"1,3-dimethyl-2-imidazolidinone",CN1CCN(C)C1=O,"InChI=1/C5H10N2O/c1-6-3-4-7(2)5(6)8/h3-4H2,1-2H3",1,COSMObase,,,,,0 +"1,3-dimethylbenzene",Cc1cccc(C)c1,"InChI=1/C8H10/c1-7-4-3-5-8(2)6-7/h3-6H,1-2H3",1,COSMOtherm,2.359,20.0,,0.0,0 +"1,3-propanediamine",NCCCN,InChI=1/C3H10N2/c4-2-1-3-5/h1-5H2,8,COSMObase,,,,,1 +"1,3-propanediol",OCCCO,"InChI=1/C3H8O2/c4-2-1-3-5/h4-5H,1-3H2",18,COSMObase,35.1,20.0,,1.0,1 
+"1,3,5-trimethylbenzene",Cc1cc(C)cc(C)c1,"InChI=1/C9H12/c1-7-4-8(2)6-9(3)5-7/h4-6H,1-3H3",1,COSMObase,2.279,20.0,,0.0,0 +"1,4-butadiol",OCCCCO,"InChI=1/C4H10O2/c5-3-1-2-4-6/h5-6H,1-4H2",22,COSMObase,31.9,25.0,,1.0,1 +"1,4-dimethylbenzene",Cc1ccc(C)cc1,"InChI=1/C8H10/c1-7-3-5-8(2)6-4-7/h3-6H,1-2H3",1,COSMOtherm,2.2735,20.0,,0.0,0 +"1,5-pentanediol",OCCCCCO,"InChI=1/C5H12O2/c6-4-2-1-3-5-7/h6-7H,1-5H2",11,COSMObase,26.2,20.0,,1.0,1 +"1,9-decadiene",C=CCCCCCCC=C,"InChI=1/C10H18/c1-3-5-7-9-10-8-6-4-2/h3-4H,1-2,5-10H2",3,COSMObase,,,,,0 +1h-indene,C1=Cc2ccccc2C1,"InChI=1/C9H8/c1-2-5-9-7-3-6-8(9)4-1/h1-6H,7H2",1,COSMObase,,,,,0 +2-(2-aminoethoxy)ethanol,NCCOCCO,"InChI=1/C4H11NO2/c5-1-3-7-4-2-6/h6H,1-5H2",19,COSMObase,,,,,1 +2-(2-methoxyethoxy)ethanol,COCCOCCO,"InChI=1/C5H12O3/c1-7-4-5-8-3-2-6/h6H,2-5H2,1H3",18,COSMObase,,,,,1 +2-2-(2-butoxyethoxy)ethoxyethanol,CCCCOCCOCCOCCO,"InChI=1/C10H22O4/c1-2-3-5-12-7-9-14-10-8-13-6-4-11/h11H,2-10H2,1H3",33,COSMObase,,,,,1 +2-2-(2-methoxyethoxy)ethoxyethanol,COCCOCCOCCO,"InChI=1/C7H16O4/c1-9-4-5-11-7-6-10-3-2-8/h8H,2-7H2,1H3",26,COSMObase,,,,,1 +2-butanol,CCC(C)O,"InChI=1/C4H10O/c1-3-4(2)5/h4-5H,3H2,1-2H3",4,COSMOtherm,17.26,20.0,,0.0,1 +2-butoxyethanol,CCCCOCCO,"InChI=1/C6H14O2/c1-2-3-5-8-6-4-7/h7H,2-6H2,1H3",13,COSMObase,,,,,1 +2-butylbenzene,CCC(C)c1ccccc1,"InChI=1/C10H14/c1-3-9(2)10-7-5-4-6-8-10/h4-9H,3H2,1-2H3",1,COSMObase,2.357,20.0,,0.0,0 +2-chloro-2-methylpropane,CC(C)(C)Cl,"InChI=1/C4H9Cl/c1-4(2,3)5/h1-3H3",1,COSMObase,9.663,20.0,,0.0,0 +2-chlorobutane,CCC(C)Cl,"InChI=1/C4H9Cl/c1-3-4(2)5/h4H,3H2,1-2H3",1,COSMObase,8.564,20.0,,0.0,0 +2-chloroethanol,OCCCl,"InChI=1/C2H5ClO/c3-1-2-4/h4H,1-2H2",4,COSMObase,25.8,20.0,,1.0,1 +2-ethoxy-2-methyl-propane,CCOC(C)(C)C,"InChI=1/C6H14O/c1-5-7-6(2,3)4/h5H2,1-4H3",1,COSMObase,,,,,0 +2-ethyl-1-hexanol,CCCCC(CC)CO,"InChI=1/C8H18O/c1-3-5-6-8(4-2)7-9/h8-9H,3-7H2,1-2H3",10,COSMObase,,,,,1 +2-furanmethanol,OCc1ccco1,"InChI=1/C5H6O2/c6-4-5-2-1-3-7-5/h1-3,6H,4H2",3,COSMObase,16.85,25.0,,0.0,1 
+2-heptanone,CCCCCC(C)=O,"InChI=1/C7H14O/c1-3-4-5-6-7(2)8/h3-6H2,1-2H3",3,COSMObase,11.95,20.0,,0.0,0 +2-hexanone,CCCCC(C)=O,"InChI=1/C6H12O/c1-3-4-5-6(2)7/h3-5H2,1-2H3",1,COSMOtherm,14.56,20.0,,0.0,0 +2-isopropoxyethanol,CC(C)OCCO,"InChI=1/C5H12O2/c1-5(2)7-4-3-6/h5-6H,3-4H2,1-2H3",9,COSMObase,,,,,1 +2-mercaptoethanol,OCCS,"InChI=1/C2H6OS/c3-1-2-4/h3-4H,1-2H2",7,COSMObase,,,,,1 +2-methoxyethanol,COCCO,"InChI=1/C3H8O2/c1-5-3-2-4/h4H,2-3H2,1H3",8,COSMOtherm,17.2,20.0,,0.0,1 +2-methyl-1-butanol,CCC(C)CO,"InChI=1/C5H12O/c1-3-5(2)4-6/h5-6H,3-4H2,1-2H3",7,COSMOtherm,15.63,20.0,,0.0,1 +2-methyl-1-pentanol,CCCC(C)CO,"InChI=1/C6H14O/c1-3-4-6(2)5-7/h6-7H,3-5H2,1-2H3",5,COSMObase,,,,,1 +2-methyl-2-butanol,CCC(C)(C)O,"InChI=1/C5H12O/c1-4-5(2,3)6/h6H,4H2,1-3H3",5,COSMObase,5.78,25.0,,0.0,1 +2-methylpentane,CCCC(C)C,"InChI=1/C6H14/c1-4-5-6(2)3/h6H,4-5H2,1-3H3",1,COSMOtherm,1.886,20.0,,0.0,0 +2-methylpyridine,Cc1ccccn1,"InChI=1/C6H7N/c1-6-4-2-3-5-7-6/h2-5H,1H3",1,COSMOtherm,10.18,20.0,,0.0,0 +2-nitropropane,CC(C)[N+](=O)[O-],"InChI=1/C3H7NO2/c1-3(2)4(5)6/h3H,1-2H3",1,COSMOtherm,26.74,15.0,,1.0,0 +2-nitrotoluene,Cc1ccccc1[N+](=O)[O-],"InChI=1/C7H7NO2/c1-6-4-2-3-5-7(6)8(9)10/h2-5H,1H3",2,COSMObase,26.26,20.0,,1.0,0 +2-pentanol,CCCC(C)O,"InChI=1/C5H12O/c1-3-4-5(2)6/h5-6H,3-4H2,1-2H3",4,COSMObase,13.71,25.0,,0.0,1 +2-pentanone,CCCC(C)=O,"InChI=1/C5H10O/c1-3-4-5(2)6/h3-4H2,1-2H3",2,COSMOtherm,15.45,20.0,,0.0,0 +2-phenylethanol,OCCc1ccccc1,"InChI=1/C8H10O/c9-7-6-8-4-2-1-3-5-8/h1-5,9H,6-7H2",5,COSMObase,12.31,20.0,,0.0,1 +2-propanol,CC(C)O,"InChI=1/C3H8O/c1-3(2)4/h3-4H,1-2H3",2,COSMOtherm,20.18,20.0,,1.0,1 +2-propoxyethanol,CCCOCCO,"InChI=1/C5H12O2/c1-2-4-7-5-3-6/h6H,2-5H2,1H3",13,COSMObase,,,,,1 +"2-pyrrolidinone,1-ethyl",CCN1CCCC1=O,"InChI=1/C6H11NO/c1-2-7-5-3-4-6(7)8/h2-5H2,1H3",2,COSMObase,,,,,0 +"2,2-dimethylbutane",CCC(C)(C)C,"InChI=1/C6H14/c1-5-6(2,3)4/h5H2,1-4H3",1,COSMObase,1.869,20.0,,0.0,0 
+"2,2,4-trimethylpentane",CC(C)CC(C)(C)C,"InChI=1/C8H18/c1-7(2)6-8(3,4)5/h7H,6H2,1-5H3",1,COSMObase,1.943,20.0,,0.0,0 +"2,2,4,4,6,8,8-heptamethylnonane",CC(CC(C)(C)C)CC(C)(C)CC(C)(C)C,"InChI=1/C16H34/c1-13(10-14(2,3)4)11-16(8,9)12-15(5,6)7/h13H,10-12H2,1-9H3",4,COSMObase,,,,,0 +"2,2,5-trimethylhexane",CC(C)CCC(C)(C)C,"InChI=1/C9H20/c1-8(2)6-7-9(3,4)5/h8H,6-7H2,1-5H3",1,COSMObase,,,,,0 +"2,2'-(methylimino)bis-ethanol",CN(CCO)CCO,"InChI=1/C5H13NO2/c1-6(2-4-7)3-5-8/h7-8H,2-5H2,1H3",33,COSMObase,,,,,1 +"2,2'-oxybis(2,1-ethanediyloxy)bisethanol",OCCOCCOCCOCCO,"InChI=1/C8H18O5/c9-1-3-11-5-7-13-8-6-12-4-2-10/h9-10H,1-8H2",15,COSMObase,20.44,20.0,,1.0,1 +"2,2'-thiobisethanol",OCCSCCO,"InChI=1/C4H10O2S/c5-1-3-7-4-2-6/h5-6H,1-4H2",22,COSMObase,,,,,1 +"2,3-butanediol",CC(O)C(C)O,"InChI=1/C4H10O2/c1-3(5)4(2)6/h3-6H,1-2H3",12,COSMObase,30.0,25.0,est,1.0,1 +"2,3-dimethylbutane",CC(C)C(C)C,"InChI=1/C6H14/c1-5(2)6(3)4/h5-6H,1-4H3",2,COSMObase,1.889,20.0,,0.0,0 +"2,3-dimethylpyridine",Cc1cccnc1C,"InChI=1/C7H9N/c1-6-4-3-5-8-7(6)2/h3-5H,1-2H3",1,COSMOtherm,8.0,20.0,est,0.0,0 +"2,3,4-trimethylpentane",CC(C)C(C)C(C)C,"InChI=1/C8H18/c1-6(2)8(5)7(3)4/h6-8H,1-5H3",1,COSMObase,1.9738,20.0,,0.0,0 +"2,4-dimethylpentane",CC(C)CC(C)C,"InChI=1/C7H16/c1-6(2)5-7(3)4/h6-7H,5H2,1-4H3",1,COSMObase,1.902,20.0,,0.0,0 +"2,4,4-trimethyl-1-pentene",C=C(C)CC(C)(C)C,"InChI=1/C8H16/c1-7(2)6-8(3,4)5/h1,6H2,2-5H3",1,COSMObase,2.0908,25.0,,0.0,0 +"2,5,8,11-tetraoxadodecane",COCCOCCOCCOC,"InChI=1/C8H18O4/c1-9-3-5-11-7-8-12-6-4-10-2/h3-8H2,1-2H3",29,COSMObase,7.62,25.0,,0.0,0 +"2,5,8,11,14-pentaoxapentadecane",COCCOCCOCCOCCOC,"InChI=1/C10H22O5/c1-11-3-5-13-7-9-15-10-8-14-6-4-12-2/h3-10H2,1-2H3",20,COSMObase,,,,,0 +"2,6-dimethylpyridine",Cc1cccc(C)n1,"InChI=1/C7H9N/c1-6-4-3-5-7(2)8-6/h3-5H,1-2H3",1,COSMObase,2.653,20.0,,0.0,0 
+"2,6,10,15,19,23-hexamethyltetracosane",CC(C)CCCC(C)CCCC(C)CCCCC(C)CCCC(C)CCCC(C)C,"InChI=1/C30H62/c1-25(2)15-11-19-29(7)23-13-21-27(5)17-9-10-18-28(6)22-14-24-30(8)20-12-16-26(3)4/h25-30H,9-24H2,1-8H3",3,COSMObase,,,,,0 +3-dimethylaminopropylamine,CN(C)CCCN,"InChI=1/C5H14N2/c1-7(2)5-3-4-6/h3-6H2,1-2H3",11,COSMObase,,,,,1 +3-heptanone,CCCCC(=O)CC,"InChI=1/C7H14O/c1-3-5-6-7(8)4-2/h3-6H2,1-2H3",2,COSMObase,12.7,20.0,,0.0,0 +3-hydroxytoluene,Cc1cccc(O)c1,"InChI=1/C7H8O/c1-6-3-2-4-7(8)5-6/h2-5,8H,1H3",1,COSMOtherm,12.44,25.0,,0.0,1 +3-methoxy-1-butanol,COC(C)CCO,"InChI=1/C5H12O2/c1-5(7-2)3-4-6/h5-6H,3-4H2,1-2H3",12,COSMObase,,,,,1 +3-methyl-hexane,CCCC(C)CC,"InChI=1/C7H16/c1-4-6-7(3)5-2/h7H,4-6H2,1-3H3",1,COSMObase,1.92,20.0,,0.0,0 +"3-methyl-tetrahydrothiophene-1,1-dioxide",CC1CCS(=O)(=O)C1,"InChI=1/C5H10O2S/c1-5-2-3-8(6,7)4-5/h5H,2-4H2,1H3",1,COSMObase,29.4,25.0,,1.0,0 +3-methylpentane,CCC(C)CC,"InChI=1/C6H14/c1-4-6(3)5-2/h6H,4-5H2,1-3H3",1,COSMOtherm,1.886,20.0,,0.0,0 +3-pentanol,CCC(O)CC,"InChI=1/C5H12O/c1-3-5(6)4-2/h5-6H,3-4H2,1-2H3",5,COSMOtherm,13.35,25.0,,0.0,1 +"3,3'-oxybis-propanenitrile",N#CCCOCCC#N,"InChI=1/C6H8N2O/c7-3-1-5-9-6-2-4-8/h1-2,5-6H2",6,COSMObase,,,,,0 +"3,7-dimethyl-1-octanol",CC(C)CCCC(C)CCO,"InChI=1/C10H22O/c1-9(2)5-4-6-10(3)7-8-11/h9-11H,4-8H2,1-3H3",7,COSMObase,,,,,1 +4-methyl-2-pentanol,CC(C)CC(C)O,"InChI=1/C6H14O/c1-5(2)4-6(3)7/h5-7H,4H2,1-3H3",4,COSMObase,,,,,1 +4-methyl-2-pentanone,CC(=O)CC(C)C,"InChI=1/C6H12O/c1-5(2)4-6(3)7/h5H,4H2,1-3H3",3,COSMOtherm,13.11,20.0,,0.0,0 +acetaldehyde,CC=O,"InChI=1/C2H4O/c1-2-3/h2H,1H3",1,COSMOtherm,21.0,18.0,,1.0,0 +aceticacid,CC(=O)O,"InChI=1/C2H4O2/c1-2(3)4/h1H3,(H,3,4)/f/h3H",2,COSMOtherm,6.2,20.0,,0.0,1 +aceticacid-2-methylpropylester,CC(=O)OCC(C)C,"InChI=1/C6H12O2/c1-5(2)4-8-6(3)7/h5H,4H2,1-3H3",4,COSMObase,6.0,20.0,,0.0,0 +aceticacidphenylmethylester,CC(=O)OCc1ccccc1,"InChI=1/C9H10O2/c1-8(10)11-7-9-5-3-2-4-6-9/h2-6H,7H2,1H3",4,COSMObase,5.34,30.0,,0.0,0 
+acetonitrile,CC#N,InChI=1/C2H3N/c1-2-3/h1H3,1,COSMOtherm,36.64,20.0,,1.0,0 +acetophenone,CC(=O)c1ccccc1,"InChI=1/C8H8O/c1-7(9)8-5-3-2-4-6-8/h2-6H,1H3",1,COSMOtherm,17.44,20.0,,0.0,0 +acetylacetone,CC(=O)CC(C)=O,"InChI=1/C5H8O2/c1-4(6)3-5(2)7/h3H2,1-2H3",4,COSMObase,26.524,30.0,,1.0,0 +aniline,Nc1ccccc1,"InChI=1/C6H7N/c7-6-4-2-1-3-5-6/h1-5H,7H2",1,COSMOtherm,7.06,20.0,,0.0,1 +anisole,COc1ccccc1,"InChI=1/C7H8O/c1-8-7-5-3-2-4-6-7/h2-6H,1H3",1,COSMOtherm,4.3,21.0,,0.0,0 +bcl3,ClB(Cl)Cl,InChI=1/BCl3/c2-1(3)4,1,COSMObase,,,,,0 +benzene,c1ccccc1,InChI=1/C6H6/c1-2-4-6-5-3-1/h1-6H,1,COSMOtherm,2.2825,20.0,,0.0,0 +benzonitrile,N#Cc1ccccc1,InChI=1/C7H5N/c8-6-7-4-2-1-3-5-7/h1-5H,1,COSMOtherm,25.9,20.0,,1.0,0 +benzylalcohol,OCc1ccccc1,"InChI=1/C7H8O/c8-6-7-4-2-1-3-5-7/h1-5,8H,6H2",4,COSMOtherm,11.916,30.0,,0.0,1 +benzylamine,NCc1ccccc1,"InChI=1/C7H9N/c8-6-7-4-2-1-3-5-7/h1-5H,6,8H2",3,COSMOtherm,5.18,20.0,,0.0,1 +bicyclohexyl,C1CCC(C2CCCCC2)CC1,"InChI=1/C12H22/c1-3-7-11(8-4-1)12-9-5-2-6-10-12/h11-12H,1-10H2",2,COSMObase,,,,,0 +bromobenzene,Brc1ccccc1,InChI=1/C6H5Br/c7-6-4-2-1-3-5-6/h1-5H,1,COSMOtherm,5.45,20.0,,0.0,0 +bromoethane,CCBr,"InChI=1/C2H5Br/c1-2-3/h2H2,1H3",1,COSMOtherm,9.01,25.0,,0.0,0 +butanal,CCCC=O,"InChI=1/C4H8O/c1-2-3-4-5/h4H,2-3H2,1H3",2,COSMObase,13.45,25.0,,0.0,0 +butane,CCCC,"InChI=1/C4H10/c1-3-4-2/h3-4H2,1-2H3",1,COSMOtherm,1.79697,22.0,,0.0,0 +butanone,CCC(C)=O,"InChI=1/C4H8O/c1-3-4(2)5/h3H2,1-2H3",1,COSMOtherm,18.56,20.0,,0.0,0 +butoxide,CCC1CO1,"InChI=1/C4H8O/c1-2-4-3-5-4/h4H,2-3H2,1H3",2,COSMObase,,,,,0 +butylbenzene,CCCCc1ccccc1,"InChI=1/C10H14/c1-2-3-7-10-8-5-4-6-9-10/h4-6,8-9H,2-3,7H2,1H3",1,COSMObase,2.359,20.0,,0.0,0 +butyronitrile,CCCC#N,"InChI=1/C4H7N/c1-2-3-4-5/h2-3H2,1H3",1,COSMOtherm,24.83,20.0,,1.0,0 +carbonicdichloride,O=C(Cl)Cl,InChI=1/CCl2O/c2-1(3)4,1,COSMObase,4.3,22.0,,0.0,0 +ccl4,ClC(Cl)(Cl)Cl,"InChI=1/CCl4/c2-1(3,4)5",1,COSMOtherm,2.2379,20.0,,0.0,0 +ch2cl2,ClCCl,InChI=1/CH2Cl2/c2-1-3/h1H2,1,COSMOtherm,8.93,25.0,,0.0,0 
+ch2clbr,ClCBr,InChI=1/CH2BrCl/c2-1-3/h1H2,1,COSMObase,,,,,0 +ch3i,CI,InChI=1/CH3I/c1-2/h1H3,1,COSMOtherm,6.97,20.0,,0.0,0 +chbr3,BrC(Br)Br,InChI=1/CHBr3/c2-1(3)4/h1H,1,COSMOtherm,4.404,20.0,,0.0,0 +chcl3,ClC(Cl)Cl,InChI=1/CHCl3/c2-1(3)4/h1H,1,COSMOtherm,4.8069,20.0,,0.0,0 +chinoline,c1ccc2ncccc2c1,InChI=1/C9H7N/c1-2-6-9-8(4-1)5-3-7-10-9/h1-7H,1,COSMOtherm,9.16,20.0,,0.0,0 +chlorobenzene,Clc1ccccc1,InChI=1/C6H5Cl/c7-6-4-2-1-3-5-6/h1-5H,1,COSMOtherm,5.6895,20.0,,0.0,0 +chlorocyclohexane,ClC1CCCCC1,"InChI=1/C6H11Cl/c7-6-4-2-1-3-5-6/h6H,1-5H2",1,COSMObase,7.9505,30.0,,0.0,0 +"cis-1,2-dichloroethene",ClC=CCl,InChI=1/C2H2Cl2/c3-1-2-4/h1-2H/b2-1-,1,COSMOtherm,9.2,25.0,,0.0,0 +cs2,S=C=S,InChI=1/CS2/c2-1-3,1,COSMOtherm,2.632,20.0,,0.0,0 +cyanoaceticacidmethylester,COC(=O)CC#N,"InChI=1/C4H5NO2/c1-7-4(6)2-3-5/h2H2,1H3",2,COSMObase,,,,,0 +cyclohexane,C1CCCCC1,InChI=1/C6H12/c1-2-4-6-5-3-1/h1-6H2,1,COSMOtherm,2.0243,20.0,,0.0,0 +cyclohexanol,OC1CCCCC1,"InChI=1/C6H12O/c7-6-4-2-1-3-5-6/h6-7H,1-5H2",3,COSMOtherm,16.4,20.0,,0.0,1 +cyclohexanone,O=C1CCCCC1,InChI=1/C6H10O/c7-6-4-2-1-3-5-6/h1-5H2,1,COSMOtherm,16.1,20.0,,0.0,0 +cyclohexene,C1=CCCCC1,"InChI=1/C6H10/c1-2-4-6-5-3-1/h1-2H,3-6H2",1,COSMOtherm,2.2176,20.0,,0.0,0 +cyclohexylbenzene,c1ccc(C2CCCCC2)cc1,"InChI=1/C12H16/c1-3-7-11(8-4-1)12-9-5-2-6-10-12/h1,3-4,7-8,12H,2,5-6,9-10H2",1,COSMObase,,,,,0 +cyclopentane,C1CCCC1,InChI=1/C5H10/c1-2-4-5-3-1/h1-5H2,1,COSMOtherm,1.9687,20.0,,0.0,0 +cyclopentanone,O=C1CCCC1,InChI=1/C5H8O/c6-5-3-1-2-4-5/h1-4H2,1,COSMOtherm,13.58,25.0,,0.0,0 +di-2-ethylhexylphthalate,CCCCC(CC)COC(=O)c1ccccc1C(=O)OCC(CC)CCCC,"InChI=1/C24H38O4/c1-5-9-13-19(7-3)17-27-23(25)21-15-11-12-16-22(21)24(26)28-18-20(8-4)14-10-6-2/h11-12,15-16,19-20H,5-10,13-14,17-18H2,1-4H3",20,COSMObase,,,,,0 +di-n-butylether,CCCCOCCCC,"InChI=1/C8H18O/c1-3-5-7-9-8-6-4-2/h3-8H2,1-2H3",6,COSMOtherm,3.083,20.0,,0.0,0 +di-n-propylether,CCCOCCC,"InChI=1/C6H14O/c1-3-5-7-6-4-2/h3-6H2,1-2H3",4,COSMOtherm,3.38,24.0,,0.0,0 
+dibenzylether,c1ccc(COCc2ccccc2)cc1,"InChI=1/C14H14O/c1-3-7-13(8-4-1)11-15-12-14-9-5-2-6-10-14/h1-10H,11-12H2",7,COSMObase,3.821,20.0,,0.0,0 +dibutylamine,CCCCNCCCC,"InChI=1/C8H19N/c1-3-5-7-9-8-6-4-2/h9H,3-8H2,1-2H3",6,COSMOtherm,2.765,20.0,,0.0,1 +dibutylphthalate,CCCCOC(=O)c1ccccc1C(=O)OCCCC,"InChI=1/C16H22O4/c1-3-5-11-19-15(17)13-9-7-8-10-14(13)16(18)20-12-6-4-2/h7-10H,3-6,11-12H2,1-2H3",12,COSMObase,6.58,20.0,,0.0,0 +dichloroaceticacid,O=C(O)C(Cl)Cl,"InChI=1/C2H2Cl2O2/c3-1(4)2(5)6/h1H,(H,5,6)/f/h5H",4,COSMObase,8.33,20.0,,0.0,1 +diethylacetamide,CCN(CC)C(C)=O,"InChI=1/C6H13NO/c1-4-7(5-2)6(3)8/h4-5H2,1-3H3",1,COSMObase,32.1,20.0,,1.0,0 +diethyleneglycol,OCCOCCO,"InChI=1/C4H10O3/c5-1-3-7-4-2-6/h5-6H,1-4H2",26,COSMObase,31.82,20.0,,1.0,1 +diethyleneglycolmonobutylether,CCCCOCCOCCO,"InChI=1/C8H18O3/c1-2-3-5-10-7-8-11-6-4-9/h9H,2-8H2,1H3",26,COSMObase,,,,,1 +diethyleneglycolmonoethylether,CCOCCOCCO,"InChI=1/C6H14O3/c1-2-8-5-6-9-4-3-7/h7H,2-6H2,1H3",21,COSMObase,,,,,1 +diethylether,CCOCC,"InChI=1/C4H10O/c1-3-5-4-2/h3-4H2,1-2H3",3,COSMOtherm,4.2666,20.0,,0.0,0 +diethylphthalate,CCOC(=O)c1ccccc1C(=O)OCC,"InChI=1/C12H14O4/c1-3-15-11(13)9-7-5-6-8-10(9)12(14)16-4-2/h5-8H,3-4H2,1-2H3",15,COSMObase,7.86,20.0,,0.0,0 +diglyme,COCCOCCOC,"InChI=1/C6H14O3/c1-7-3-5-9-6-4-8-2/h3-6H2,1-2H3",18,COSMObase,7.23,25.0,,0.0,0 +diiodomethane,ICI,InChI=1/CH2I2/c2-1-3/h1H2,1,COSMObase,5.32,25.0,,0.0,0 +diisopropylether,CC(C)OC(C)C,"InChI=1/C6H14O/c1-5(2)7-6(3)4/h5-6H,1-4H3",3,COSMOtherm,3.805,30.0,,0.0,0 +dimethoxymethane,COCOC,"InChI=1/C3H8O2/c1-4-3-5-2/h3H2,1-2H3",1,COSMObase,2.644,20.0,,0.0,0 +dimethylcarbonate,COC(=O)OC,InChI=1/C3H6O3/c1-5-3(4)6-2/h1-2H3,2,COSMObase,3.087,25.0,,0.0,0 +dimethylformamide,CN(C)C=O,"InChI=1/C3H7NO/c1-4(2)3-5/h3H,1-2H3",1,COSMOtherm,38.25,20.0,,1.0,0 +dimethylsulfide,CSC,InChI=1/C2H6S/c1-3-2/h1-2H3,1,COSMOtherm,6.7,21.0,,0.0,0 +dimethylsulfoxide,CS(C)=O,InChI=1/C2H6OS/c1-4(2)3/h1-2H3,1,COSMOtherm,47.24,20.0,,1.0,0 
+dioxane,C1COCCO1,InChI=1/C4H8O2/c1-2-6-4-3-5-1/h1-4H2,1,COSMOtherm,2.2189,20.0,,0.0,0 +diphenylether,c1ccc(Oc2ccccc2)cc1,InChI=1/C12H10O/c1-3-7-11(8-4-1)13-12-9-5-2-6-10-12/h1-10H,1,COSMOtherm,3.726,10.0,,0.0,0 +dodecane,CCCCCCCCCCCC,"InChI=1/C12H26/c1-3-5-7-9-11-12-10-8-6-4-2/h3-12H2,1-2H3",4,COSMObase,2.012,20.0,,0.0,0 +dodecanol,CCCCCCCCCCCCO,"InChI=1/C12H26O/c1-2-3-4-5-6-7-8-9-10-11-12-13/h13H,2-12H2,1H3",3,COSMObase,5.82,30.0,,0.0,1 +ethanol,CCO,"InChI=1/C2H6O/c1-2-3/h3H,2H2,1H3",2,COSMOtherm,25.3,20.0,,1.0,1 +ethanolamine,NCCO,"InChI=1/C2H7NO/c3-1-2-4/h4H,1-3H2",10,COSMOtherm,31.94,20.0,,1.0,1 +ethoxybenzene,CCOc1ccccc1,"InChI=1/C8H10O/c1-2-9-8-6-4-3-5-7-8/h3-7H,2H2,1H3",1,COSMOtherm,4.216,20.0,,0.0,0 +ethoxyethanol,CCOCCO,"InChI=1/C4H10O2/c1-2-6-4-3-5/h5H,2-4H2,1H3",10,COSMOtherm,13.38,25.0,,0.0,1 +ethylacetate,CCOC(C)=O,"InChI=1/C4H8O2/c1-3-6-4(2)5/h3H2,1-2H3",3,COSMOtherm,6.0814,20.0,,0.0,0 +ethylamine,CCN,"InChI=1/C2H7N/c1-2-3/h2-3H2,1H3",2,COSMOtherm,8.7,0.0,,0.0,1 +ethylbenzene,CCc1ccccc1,"InChI=1/C8H10/c1-2-8-6-4-3-5-7-8/h3-7H,2H2,1H3",1,COSMOtherm,2.4463,20.0,,0.0,0 +ethylbenzoate,CCOC(=O)c1ccccc1,"InChI=1/C9H10O2/c1-2-11-9(10)8-6-4-3-5-7-8/h3-7H,2H2,1H3",2,COSMObase,6.2,20.0,,0.0,0 +ethylbutyrate,CCCC(=O)OCC,"InChI=1/C6H12O2/c1-3-5-6(7)8-4-2/h3-5H2,1-2H3",5,COSMObase,5.18,28.0,,0.0,0 +ethylenecyanohydrin,N#CCCO,"InChI=1/C3H5NO/c4-2-1-3-5/h5H,1,3H2",5,COSMObase,,,,,1 +ethyleneoxide,C1CO1,InChI=1/C2H4O/c1-2-3-1/h1-2H2,1,COSMObase,12.42,20.0,,0.0,0 +ethylpropylether,CCCOCC,"InChI=1/C5H12O/c1-3-5-6-4-2/h3-5H2,1-2H3",2,COSMObase,,,,,0 +fluorobenzene,Fc1ccccc1,InChI=1/C6H5F/c7-6-4-2-1-3-5-6/h1-5H,1,COSMOtherm,5.465,20.0,,0.0,0 +formamide,NC=O,"InChI=1/CH3NO/c2-1-3/h1H,(H2,2,3)/f/h2H2",1,COSMObase,111.0,20.0,,1.0,1 +furfural,O=Cc1ccco1,InChI=1/C5H4O2/c6-4-5-2-1-3-7-5/h1-4H,2,COSMOtherm,42.1,20.0,,1.0,0 +gamma-butyrolactone,O=C1CCCO1,InChI=1/C4H6O2/c5-4-2-1-3-6-4/h1-3H2,1,COSMObase,39.0,20.0,,1.0,0 
+glycerol,OCC(O)CO,"InChI=1/C3H8O3/c4-1-3(6)2-5/h3-6H,1-2H2",29,COSMObase,46.53,20.0,,1.0,1 +glycol,OCCO,"InChI=1/C2H6O2/c3-1-2-4/h3-4H,1-2H2",8,COSMObase,41.4,20.0,,1.0,1 +h2o,O,InChI=1/H2O/h1H2,1,COSMOtherm,80.1,20.0,,1.0,1 +hexafluorobenzene,Fc1c(F)c(F)c(F)c(F)c1F,InChI=1/C6F6/c7-1-2(8)4(10)6(12)5(11)3(1)9,1,COSMOtherm,2.029,25.0,,0.0,0 +hexamethylphosphoramide,CN(C)P(=O)(N(C)C)N(C)C,"InChI=1/C6H18N3OP/c1-7(2)11(10,8(3)4)9(5)6/h1-6H3",5,COSMObase,31.3,20.0,,1.0,0 +hexane,CCCCCC,"InChI=1/C6H14/c1-3-5-6-4-2/h3-6H2,1-2H3",1,COSMOtherm,1.8865,20.0,,0.0,0 +hexanedinitrile,N#CCCCCC#N,InChI=1/C6H8N2/c7-5-3-1-2-4-6-8/h1-4H2,6,COSMOtherm,,,,,0 +hexanedioicacid-bis(2-ethylhexyl)ester,CCCCC(CC)COC(=O)CCCCC(=O)OCC(CC)CCCC,"InChI=1/C22H42O4/c1-5-9-13-19(7-3)17-25-21(23)15-11-12-16-22(24)26-18-20(8-4)14-10-6-2/h19-20H,5-18H2,1-4H3",13,COSMObase,,,,,0 +hexanenitrile,CCCCCC#N,"InChI=1/C6H11N/c1-2-3-4-5-6-7/h2-5H2,1H3",2,COSMOtherm,17.26,25.0,,0.0,0 +iodobenzene,Ic1ccccc1,InChI=1/C6H5I/c7-6-4-2-1-3-5-6/h1-5H,1,COSMOtherm,4.59,20.0,,0.0,0 +iodoethane,CCI,"InChI=1/C2H5I/c1-2-3/h2H2,1H3",1,COSMOtherm,7.82,20.0,,0.0,0 +isobutane,CC(C)C,"InChI=1/C4H10/c1-4(2)3/h4H,1-3H3",1,COSMObase,1.7518,22.0,,0.0,0 +isobutanol,CC(C)CO,"InChI=1/C4H10O/c1-4(2)3-5/h4-5H,3H2,1-2H3",3,COSMObase,17.93,20.0,,0.0,1 +isobutylbenzene,CC(C)Cc1ccccc1,"InChI=1/C10H14/c1-9(2)8-10-6-4-3-5-7-10/h3-7,9H,8H2,1-2H3",1,COSMObase,2.318,20.0,,0.0,0 +isopentanol,CC(C)CCO,"InChI=1/C5H12O/c1-5(2)3-4-6/h5-6H,3-4H2,1-2H3",6,COSMOtherm,15.63,20.0,,0.0,1 +isopropylacetate,CC(=O)OC(C)C,"InChI=1/C5H10O2/c1-4(2)7-5(3)6/h4H,1-3H3",4,COSMObase,,,,,0 +isopropylamine,CC(C)N,"InChI=1/C3H9N/c1-3(2)4/h3H,4H2,1-2H3",3,COSMObase,5.6268,20.0,,0.0,1 +isopropylbenzene,CC(C)c1ccccc1,"InChI=1/C9H12/c1-8(2)9-6-4-3-5-7-9/h3-8H,1-2H3",1,COSMOtherm,2.381,20.0,,0.0,0 +methanol,CO,"InChI=1/CH4O/c1-2/h2H,1H3",1,COSMOtherm,33.0,20.0,,1.0,1 +methyl-t-butylether,COC(C)(C)C,"InChI=1/C5H12O/c1-5(2,3)6-4/h1-4H3",1,COSMObase,,,,,0 
+methyl-tert-amylether,CCC(C)(C)OC,"InChI=1/C6H14O/c1-5-6(2,3)7-4/h5H2,1-4H3",5,COSMObase,,,,,0 +methylacetate,COC(C)=O,InChI=1/C3H6O2/c1-3(4)5-2/h1-2H3,2,COSMOtherm,7.07,15.0,,0.0,0 +methylbutane,CCC(C)C,"InChI=1/C5H12/c1-4-5(2)3/h5H,4H2,1-3H3",1,COSMOtherm,1.845,20.0,,0.0,0 +methylbutyrate,CCCC(=O)OC,"InChI=1/C5H10O2/c1-3-4-5(6)7-2/h3-4H2,1-2H3",3,COSMOtherm,,,,,0 +methylcyclohexane,CC1CCCCC1,"InChI=1/C7H14/c1-7-5-3-2-4-6-7/h7H,2-6H2,1H3",1,COSMOtherm,2.024,20.0,,0.0,0 +methylcyclopentane,CC1CCCC1,"InChI=1/C6H12/c1-6-4-2-3-5-6/h6H,2-5H2,1H3",1,COSMOtherm,1.9853,20.0,,0.0,0 +methylformamide,CNC=O,"InChI=1/C2H5NO/c1-3-2-4/h2H,1H3,(H,3,4)/f/h3H",2,COSMObase,189.0,20.0,,1.0,1 +methyln-butylether,CCCCOC,"InChI=1/C5H12O/c1-3-4-5-6-2/h3-5H2,1-2H3",2,COSMObase,,,,,0 +n-butylacetate,CCCCOC(C)=O,"InChI=1/C6H12O2/c1-3-4-5-8-6(2)7/h3-5H2,1-2H3",5,COSMObase,5.07,20.0,,0.0,0 +n-decane,CCCCCCCCCC,"InChI=1/C10H22/c1-3-5-7-9-10-8-6-4-2/h3-10H2,1-2H3",5,COSMObase,1.9853,20.0,,0.0,0 +n-ethylacetamide,CCNC(C)=O,"InChI=1/C4H9NO/c1-3-5-4(2)6/h3H2,1-2H3,(H,5,6)/f/h5H",2,COSMObase,135.0,20.0,,1.0,1 +n-formylethylamine,CCNC=O,"InChI=1/C3H7NO/c1-2-4-3-5/h3H,2H2,1H3,(H,4,5)/f/h4H",3,COSMObase,102.7,25.0,,1.0,1 +n-formylmorpholine,O=CN1CCOCC1,"InChI=1/C5H9NO2/c7-5-6-1-3-8-4-2-6/h5H,1-4H2",1,COSMObase,,,,,0 +n-heptane,CCCCCCC,"InChI=1/C7H16/c1-3-5-7-6-4-2/h3-7H2,1-2H3",1,COSMOtherm,1.9209,20.0,,0.0,0 +n-hexadecane,CCCCCCCCCCCCCCCC,"InChI=1/C16H34/c1-3-5-7-9-11-13-15-16-14-12-10-8-6-4-2/h3-16H2,1-2H3",3,COSMObase,2.046,20.0,,0.0,0 +n-hexylacetate,CCCCCCOC(C)=O,"InChI=1/C8H16O2/c1-3-4-5-6-7-10-8(2)9/h3-7H2,1-2H3",6,COSMObase,4.42,20.0,,0.0,0 +n-methyl-2-pyrrolidinone,CN1CCCC1=O,"InChI=1/C5H9NO/c1-6-4-2-3-5(6)7/h2-4H2,1H3",1,COSMObase,32.55,20.0,,1.0,0 +n-methylacetamide,CNC(C)=O,"InChI=1/C3H7NO/c1-3(5)4-2/h1-2H3,(H,4,5)/f/h4H",1,COSMObase,179.0,30.0,,1.0,1 +n-nonane,CCCCCCCCC,"InChI=1/C9H20/c1-3-5-7-9-8-6-4-2/h3-9H2,1-2H3",6,COSMObase,1.9722,20.0,,0.0,0 
+n-pentylacetate,CCCCCOC(C)=O,"InChI=1/C7H14O2/c1-3-4-5-6-9-7(2)8/h3-6H2,1-2H3",4,COSMObase,4.79,20.0,,0.0,0 +n-propylacetate,CCCOC(C)=O,"InChI=1/C5H10O2/c1-3-4-7-5(2)6/h3-4H2,1-2H3",4,COSMObase,5.62,20.0,,0.0,0 +n-propylamine,CCCN,"InChI=1/C3H9N/c1-2-3-4/h2-4H2,1H3",3,COSMOtherm,5.08,23.0,,0.0,1 +n-undecane,CCCCCCCCCCC,"InChI=1/C11H24/c1-3-5-7-9-11-10-8-6-4-2/h3-11H2,1-2H3",5,COSMObase,1.9972,20.0,,0.0,0 +"n,n-dibutylformamide",CCCCN(C=O)CCCC,"InChI=1/C9H19NO/c1-3-5-7-10(9-11)8-6-4-2/h9H,3-8H2,1-2H3",6,COSMObase,18.4,20.0,,0.0,0 +"n,n-dimethylacetamide",CC(=O)N(C)C,InChI=1/C4H9NO/c1-4(6)5(2)3/h1-3H3,1,COSMOtherm,38.85,21.0,,1.0,0 +"n,n-dioctyl-1-octanamine",CCCCCCCCN(CCCCCCCC)CCCCCCCC,"InChI=1/C24H51N/c1-4-7-10-13-16-19-22-25(23-20-17-14-11-8-5-2)24-21-18-15-12-9-6-3/h4-24H2,1-3H3",17,COSMObase,,,,,0 +"n,n-dipentyl-1-pentanamine",CCCCCN(CCCCC)CCCCC,"InChI=1/C15H33N/c1-4-7-10-13-16(14-11-8-5-2)15-12-9-6-3/h4-15H2,1-3H3",18,COSMObase,,,,,0 +nitrobenzene,O=[N+]([O-])c1ccccc1,InChI=1/C6H5NO2/c8-7(9)6-4-2-1-3-5-6/h1-5H,1,COSMOtherm,35.6,20.0,,1.0,0 +nitroethane,CC[N+](=O)[O-],"InChI=1/C2H5NO2/c1-2-3(4)5/h2H2,1H3",2,COSMOtherm,29.11,15.0,,1.0,0 +nitromethane,C[N+](=O)[O-],InChI=1/CH3NO2/c1-2(3)4/h1H3,1,COSMOtherm,37.27,20.0,,1.0,0 +octamethylcyclotetrasiloxane,C[Si]1(C)O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)O1,"InChI=1/C8H24O4Si4/c1-13(2)9-14(3,4)11-16(7,8)12-15(5,6)10-13/h1-8H3",7,COSMObase,2.39,23.0,,0.0,0 +octane,CCCCCCCC,"InChI=1/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3",2,COSMOtherm,1.948,20.0,,0.0,0 +octene,C=CCCCCCC,"InChI=1/C8H16/c1-3-5-7-8-6-4-2/h3H,1,4-8H2,2H3",2,COSMObase,2.113,20.0,,0.0,0 +pentadecane,CCCCCCCCCCCCCCC,"InChI=1/C15H32/c1-3-5-7-9-11-13-15-14-12-10-8-6-4-2/h3-15H2,1-2H3",2,COSMObase,2.0391,20.0,,0.0,0 +pentane,CCCCC,"InChI=1/C5H12/c1-3-5-4-2/h3-5H2,1-2H3",1,COSMOtherm,1.8371,20.0,,0.0,0 +pentanedinitrile,N#CCCCC#N,InChI=1/C5H6N2/c6-4-2-1-3-5-7/h1-3H2,4,COSMObase,,,,,0 
+perfluoro-n-hexane,FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,"InChI=1/C6F14/c7-1(8,3(11,12)5(15,16)17)2(9,10)4(13,14)6(18,19)20",1,COSMObase,1.76,25.0,,0.0,0 +perfluoroheptane,FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,"InChI=1/C7F16/c8-1(9,2(10,11)4(14,15)6(18,19)20)3(12,13)5(16,17)7(21,22)23",1,COSMObase,1.847,16.0,,0.0,0 +perfluorooctane,FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,"InChI=1/C8F18/c9-1(10,3(13,14)5(17,18)7(21,22)23)2(11,12)4(15,16)6(19,20)8(24,25)26",1,COSMObase,,,,,0 +phenylacetonitrile,N#CCc1ccccc1,"InChI=1/C8H7N/c9-7-6-8-4-2-1-3-5-8/h1-5H,6H2",2,COSMObase,17.87,26.0,,0.0,0 +piperidine,C1CCNCC1,"InChI=1/C5H11N/c1-2-4-6-5-3-1/h6H,1-5H2",2,COSMOtherm,4.33,20.0,,0.0,1 +propane,CCC,"InChI=1/C3H8/c1-3-2/h3H2,1-2H3",1,COSMOtherm,1.6678,20.0,,0.0,0 +propanol,CCCO,"InChI=1/C3H8O/c1-2-3-4/h4H,2-3H2,1H3",4,COSMOtherm,20.8,20.0,,1.0,1 +propanone,CC(C)=O,InChI=1/C3H6O/c1-3(2)4/h1-2H3,1,COSMOtherm,21.01,20.0,,1.0,0 +propionitrile,CCC#N,"InChI=1/C3H5N/c1-2-3-4/h2H2,1H3",1,COSMOtherm,29.7,20.0,,1.0,0 +propylenecarbonate,CC1COC(=O)O1,"InChI=1/C4H6O3/c1-3-2-6-4(5)7-3/h3H,2H2,1H3",1,COSMObase,66.14,20.0,,1.0,0 +propyleneglycol,CC(O)CO,"InChI=1/C3H8O2/c1-3(5)2-4/h3-5H,2H2,1H3",10,COSMObase,27.5,30.0,,1.0,1 +pyridine,c1ccncc1,InChI=1/C5H5N/c1-2-4-6-5-3-1/h1-5H,1,COSMOtherm,13.26,20.0,,0.0,0 +pyrrolidine,C1CCNC1,"InChI=1/C4H9N/c1-2-4-5-3-1/h5H,1-4H2",2,COSMOtherm,8.3,20.0,,0.0,1 +sec-butylacetate,CCC(C)OC(C)=O,"InChI=1/C6H12O2/c1-4-5(2)8-6(3)7/h5H,4H2,1-3H3",4,COSMObase,5.135,20.0,,0.0,0 +so2,O=S=O,InChI=1/O2S/c1-3-2,1,COSMObase,14.3,20.0,,0.0,0 +styrene,C=Cc1ccccc1,"InChI=1/C8H8/c1-2-8-6-4-3-5-7-8/h2-7H,1H2",1,COSMOtherm,2.4737,20.0,,0.0,0 +tert-butanol,CC(C)(C)O,"InChI=1/C4H10O/c1-4(2,3)5/h5H,1-3H3",1,COSMOtherm,18.0,20.0,,0.0,1 +tert-butylacetate,CC(=O)OC(C)(C)C,"InChI=1/C6H12O2/c1-5(7)8-6(2,3)4/h1-4H3",2,COSMObase,5.672,20.0,,0.0,0 +tetrachloroethene,ClC(Cl)=C(Cl)Cl,InChI=1/C2Cl4/c3-1(4)2(5)6,1,COSMOtherm,2.268,30.0,,0.0,0 
+tetradecane,CCCCCCCCCCCCCC,"InChI=1/C14H30/c1-3-5-7-9-11-13-14-12-10-8-6-4-2/h3-14H2,1-2H3",3,COSMObase,2.0343,20.0,,0.0,0 +tetrahydrofurfurylalcohol,OCC1CCCO1,"InChI=1/C5H10O2/c6-4-5-2-1-3-7-5/h5-6H,1-4H2",11,COSMObase,13.48,30.0,,0.0,1 +"tetrahydrothiophene-1,1-dioxide",O=S1(=O)CCCC1,InChI=1/C4H8O2S/c5-7(6)3-1-2-4-7/h1-4H2,1,COSMObase,,,,,0 +tetralin,c1ccc2c(c1)CCCC2,"InChI=1/C10H12/c1-2-6-10-8-4-3-7-9(10)5-1/h1-2,5-6H,3-4,7-8H2",1,COSMObase,2.771,25.0,,0.0,0 +thf,C1CCOC1,InChI=1/C4H8O/c1-2-4-5-3-1/h1-4H2,1,COSMOtherm,7.52,22.0,,0.0,0 +thiophene,c1ccsc1,InChI=1/C4H4S/c1-2-4-5-3-1/h1-4H,1,COSMOtherm,2.739,20.0,,0.0,0 +thp,C1CCOCC1,InChI=1/C5H10O/c1-2-4-6-5-3-1/h1-5H2,1,COSMObase,5.66,20.0,,0.0,0 +toluene,Cc1ccccc1,"InChI=1/C7H8/c1-7-5-3-2-4-6-7/h2-6H,1H3",1,COSMOtherm,2.379,23.0,,0.0,0 +trans-decalin,C1CCC2CCCCC2C1,"InChI=1/C10H18/c1-2-6-10-8-4-3-7-9(10)5-1/h9-10H,1-8H2/t9-,10-",1,COSMObase,2.184,20.0,,0.0,0 +tri-n-butylamine,CCCCN(CCCC)CCCC,"InChI=1/C12H27N/c1-4-7-10-13(11-8-5-2)12-9-6-3/h4-12H2,1-3H3",11,COSMObase,2.34,20.0,,0.0,0 +tri-n-butylphosphate,CCCCOP(=O)(OCCCC)OCCCC,"InChI=1/C12H27O4P/c1-4-7-10-14-17(13,15-11-8-5-2)16-12-9-6-3/h4-12H2,1-3H3",35,COSMObase,8.34,20.0,,0.0,0 +tricaprylin,CCCCCCCC(=O)OCC(COC(=O)CCCCCCC)OC(=O)CCCCCCC,"InChI=1/C27H50O6/c1-4-7-10-13-16-19-25(28)31-22-24(33-27(30)21-18-15-12-9-6-3)23-32-26(29)20-17-14-11-8-5-2/h24H,4-23H2,1-3H3",20,COSMObase,,,,,0 +tridecane,CCCCCCCCCCCCC,"InChI=1/C13H28/c1-3-5-7-9-11-13-12-10-8-6-4-2/h3-13H2,1-2H3",3,COSMObase,2.0213,20.0,,0.0,0 +triethylamine,CCN(CC)CC,"InChI=1/C6H15N/c1-4-7(5-2)6-3/h4-6H2,1-3H3",4,COSMOtherm,2.418,20.0,,0.0,0 +triethyleneglycol,OCCOCCOCCO,"InChI=1/C6H14O4/c7-1-3-9-5-6-10-4-2-8/h7-8H,1-6H2",27,COSMObase,23.69,20.0,,1.0,1 +triethylphosphate,CCOP(=O)(OCC)OCC,"InChI=1/C6H15O4P/c1-4-8-11(7,9-5-2)10-6-3/h4-6H2,1-3H3",12,COSMObase,13.2,25.0,,0.0,0 +trimethylphosphate,COP(=O)(OC)OC,"InChI=1/C3H9O4P/c1-5-8(4,6-2)7-3/h1-3H3",6,COSMObase,20.6,20.0,,1.0,0 
+valeronitrile,CCCCC#N,"InChI=1/C5H9N/c1-2-3-4-5-6/h2-4H2,1H3",1,COSMObase,20.04,20.0,,1.0,0 diff --git a/scripts/solvation/split_nonts_solvation_data.ipynb b/scripts/solvation/split_nonts_solvation_data.ipynb new file mode 100644 index 0000000..44b2c6a --- /dev/null +++ b/scripts/solvation/split_nonts_solvation_data.ipynb @@ -0,0 +1,915 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "18a969de", + "metadata": {}, + "outputs": [], + "source": [ + "import pathlib\n", + "\n", + "import pandas as pd\n", + "import swifter\n", + "\n", + "from rdkit import Chem\n", + "from rdkit.Chem import Descriptors\n", + "from tqdm.auto import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e88bfd35", + "metadata": {}, + "outputs": [], + "source": [ + "NOTEBOOK_DIR = pathlib.Path().absolute()\n", + "DATA_DIR = NOTEBOOK_DIR.parents[1] / \"data\" / \"solvation\"\n", + "QUANTUM_GREEN_DIR = pathlib.Path(\"/home/shared/projects/quantum_green\")\n", + "PAPER_DATA_DIR = QUANTUM_GREEN_DIR / \"paper\" / \"data\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b399394c", + "metadata": {}, + "outputs": [], + "source": [ + "def molecule_from_smiles(smiles, remove_atom_mapping=False):\n", + " mol = Chem.MolFromSmiles(smiles)\n", + " if remove_atom_mapping:\n", + " for atom in mol.GetAtoms():\n", + " atom.SetAtomMapNum(0)\n", + " return mol\n", + "\n", + "\n", + "def canonical_smiles_from_molecule(mol, isomeric=True):\n", + " return Chem.MolToSmiles(mol, isomericSmiles=isomeric)\n", + "\n", + "\n", + "def first_letter(name):\n", + " return next(s for s in name if s.isalpha())\n", + "\n", + "\n", + "pd.set_option(\"display.max_columns\", None)\n", + "\n", + "\n", + "def head(df, n=2):\n", + " display(df.head(n))\n", + " print(f\"Contains {len(df)} rows\")\n", + "\n", + "\n", + "def swifter_apply(series, func, desc=None):\n", + " if desc is None:\n", + " desc = \"Applying function\"\n", + " return 
series.swifter.progress_bar(True, desc).apply(func)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6576962f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cosmo_namesmilesinchicosmo_confsourceexp_dielectricTsource.1polarprotic
0(1,1-dimethylethyl)benzeneCC(C)(C)c1ccccc1InChI=1/C10H14/c1-10(2,3)9-7-5-4-6-8-9/h4-8H,1...1COSMObase2.35920.0NaN0.00
11-(1,1-dimethylethoxy)-2-propanolCC(O)COC(C)(C)CInChI=1/C7H16O2/c1-6(8)5-9-7(2,3)4/h6,8H,5H2,1...5COSMObaseNaNNaNNaNNaN1
\n", + "
" + ], + "text/plain": [ + " cosmo_name smiles \\\n", + "0 (1,1-dimethylethyl)benzene CC(C)(C)c1ccccc1 \n", + "1 1-(1,1-dimethylethoxy)-2-propanol CC(O)COC(C)(C)C \n", + "\n", + " inchi cosmo_conf source \\\n", + "0 InChI=1/C10H14/c1-10(2,3)9-7-5-4-6-8-9/h4-8H,1... 1 COSMObase \n", + "1 InChI=1/C7H16O2/c1-6(8)5-9-7(2,3)4/h6,8H,5H2,1... 5 COSMObase \n", + "\n", + " exp_dielectric T source.1 polar protic \n", + "0 2.359 20.0 NaN 0.0 0 \n", + "1 NaN NaN NaN NaN 1 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 295 rows\n" + ] + } + ], + "source": [ + "names_df = pd.read_csv(pathlib.Path.cwd() / \"solvents.csv\")\n", + "head(names_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3033a8ba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.int64(0)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "non_isomeric_canonical_smiles = names_df[\"smiles\"].apply(\n", + " lambda x: canonical_smiles_from_molecule(molecule_from_smiles(x), isomeric=False)\n", + ")\n", + "(names_df[\"smiles\"] != non_isomeric_canonical_smiles).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6a0ba6fa", + "metadata": {}, + "outputs": [], + "source": [ + "smiles_to_name_mapping = names_df.set_index(\"smiles\")[\"cosmo_name\"].to_dict()" + ] + }, + { + "cell_type": "markdown", + "id": "9e8149ec", + "metadata": {}, + "source": [ + "## Species Data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f2104f6b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
solvent_smilessolute_namesolute_smilesGsolv (kcal/mol)Hsolv (kcal/mol)projecthash
0Oid0[O:1]([O:2][H:4])[H:3]-7.178090-11.557472aug11bInChI=1/H2O2/c1-2/h1-2HInChI=1S/H2O/h1H2
1CCCCCCCCOid0[O:1]([O:2][H:4])[H:3]-5.797229-14.171550aug11bInChI=1/H2O2/c1-2/h1-2HInChI=1S/C8H18O/c1-2-3-...
\n", + "
" + ], + "text/plain": [ + " solvent_smiles solute_name solute_smiles Gsolv (kcal/mol) \\\n", + "0 O id0 [O:1]([O:2][H:4])[H:3] -7.178090 \n", + "1 CCCCCCCCO id0 [O:1]([O:2][H:4])[H:3] -5.797229 \n", + "\n", + " Hsolv (kcal/mol) project hash \n", + "0 -11.557472 aug11b InChI=1/H2O2/c1-2/h1-2HInChI=1S/H2O/h1H2 \n", + "1 -14.171550 aug11b InChI=1/H2O2/c1-2/h1-2HInChI=1S/C8H18O/c1-2-3-... " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 101513860 rows\n" + ] + } + ], + "source": [ + "data = pd.read_csv(\n", + " PAPER_DATA_DIR / \"solvation\" / \"FILTERED_DEDUPLICATED_full_data_v3.csv\",\n", + " low_memory=False,\n", + ")\n", + "head(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3a485558", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
solvent_smilesnon_isomeric_canonical_smiles
0OO
1CCCCCCCCOCCCCCCCCO
\n", + "
" + ], + "text/plain": [ + " solvent_smiles non_isomeric_canonical_smiles\n", + "0 O O\n", + "1 CCCCCCCCO CCCCCCCCO" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 295 rows\n" + ] + } + ], + "source": [ + "unique_solvents = pd.DataFrame(\n", + " data[\"solvent_smiles\"].unique(), columns=[\"solvent_smiles\"]\n", + ")\n", + "unique_solvents[\"non_isomeric_canonical_smiles\"] = unique_solvents[\n", + " \"solvent_smiles\"\n", + "].apply(\n", + " lambda x: canonical_smiles_from_molecule(molecule_from_smiles(x), isomeric=False)\n", + ")\n", + "head(unique_solvents)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f074b94c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.int64(85)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " unique_solvents[\"solvent_smiles\"]\n", + " != unique_solvents[\"non_isomeric_canonical_smiles\"]\n", + ").sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4fceaa13", + "metadata": {}, + "outputs": [], + "source": [ + "solvent_smiles_to_non_isomeric_canonical_smiles_mapping = unique_solvents.set_index(\n", + " \"solvent_smiles\"\n", + ")[\"non_isomeric_canonical_smiles\"].to_dict()\n", + "solvent_smiles_to_name_mapping = {\n", + " k: smiles_to_name_mapping[v]\n", + " for k, v in solvent_smiles_to_non_isomeric_canonical_smiles_mapping.items()\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "676d8da8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
solvent_smilessolute_namesolute_smilesGsolv (kcal/mol)Hsolv (kcal/mol)projecthashsolvent_name
0Oid0[O:1]([O:2][H:4])[H:3]-7.178090-11.557472aug11bInChI=1/H2O2/c1-2/h1-2HInChI=1S/H2O/h1H2h2o
1CCCCCCCCOid0[O:1]([O:2][H:4])[H:3]-5.797229-14.171550aug11bInChI=1/H2O2/c1-2/h1-2HInChI=1S/C8H18O/c1-2-3-...1-octanol
\n", + "
" + ], + "text/plain": [ + " solvent_smiles solute_name solute_smiles Gsolv (kcal/mol) \\\n", + "0 O id0 [O:1]([O:2][H:4])[H:3] -7.178090 \n", + "1 CCCCCCCCO id0 [O:1]([O:2][H:4])[H:3] -5.797229 \n", + "\n", + " Hsolv (kcal/mol) project \\\n", + "0 -11.557472 aug11b \n", + "1 -14.171550 aug11b \n", + "\n", + " hash solvent_name \n", + "0 InChI=1/H2O2/c1-2/h1-2HInChI=1S/H2O/h1H2 h2o \n", + "1 InChI=1/H2O2/c1-2/h1-2HInChI=1S/C8H18O/c1-2-3-... 1-octanol " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 101513860 rows\n" + ] + } + ], + "source": [ + "data[\"solvent_name\"] = data[\"solvent_smiles\"].map(solvent_smiles_to_name_mapping)\n", + "head(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "acd16664", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
atom_mapped_smiles
0[O:1]([O:2][H:4])[H:3]
1[C:1]([C:2]([F:3])([H:7])[H:8])([H:4])([H:5])[...
\n", + "
" + ], + "text/plain": [ + " atom_mapped_smiles\n", + "0 [O:1]([O:2][H:4])[H:3]\n", + "1 [C:1]([C:2]([F:3])([H:7])[H:8])([H:4])([H:5])[..." + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 344164 rows\n" + ] + } + ], + "source": [ + "unique_solutes = pd.DataFrame(\n", + " data[\"solute_smiles\"].unique(), columns=[\"atom_mapped_smiles\"]\n", + ")\n", + "head(unique_solutes)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "40fd40bf", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6a9459166fe7479ca75ead6319e29b67", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Generating mols: 0%| | 0/344164 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
atom_mapped_smilescanonical_smilesnum_radical_electrons
0[O:1]([O:2][H:4])[H:3]OO0
1[C:1]([C:2]([F:3])([H:7])[H:8])([H:4])([H:5])[...CCF0
\n", + "" + ], + "text/plain": [ + " atom_mapped_smiles canonical_smiles \\\n", + "0 [O:1]([O:2][H:4])[H:3] OO \n", + "1 [C:1]([C:2]([F:3])([H:7])[H:8])([H:4])([H:5])[... CCF \n", + "\n", + " num_radical_electrons \n", + "0 0 \n", + "1 0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 344164 rows\n" + ] + } + ], + "source": [ + "molecules = swifter_apply(\n", + " unique_solutes[\"atom_mapped_smiles\"],\n", + " lambda x: molecule_from_smiles(x, remove_atom_mapping=True),\n", + " desc=\"Generating mols\",\n", + ")\n", + "\n", + "unique_solutes[\"canonical_smiles\"] = swifter_apply(\n", + " molecules,\n", + " Chem.MolToSmiles,\n", + " desc=\"Generating canonical smiles\",\n", + ")\n", + "\n", + "unique_solutes[\"num_radical_electrons\"] = swifter_apply(\n", + " molecules,\n", + " Descriptors.NumRadicalElectrons,\n", + " desc=\"Detecting radical electrons\",\n", + ")\n", + "\n", + "head(unique_solutes)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7ed1d94e", + "metadata": {}, + "outputs": [], + "source": [ + "solute_to_canonical_smiles_mapping = unique_solutes.set_index(\"atom_mapped_smiles\")[\n", + " \"canonical_smiles\"\n", + "].to_dict()\n", + "\n", + "solute_to_num_radical_electrons_mapping = unique_solutes.set_index(\n", + " \"atom_mapped_smiles\"\n", + ")[\"num_radical_electrons\"].to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "78f244d5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
solvent_smilessolute_namesolute_smilesGsolv (kcal/mol)Hsolv (kcal/mol)projecthashsolvent_namesmilesnum_radical_electrons
0Oid0[O:1]([O:2][H:4])[H:3]-7.178090-11.557472aug11bInChI=1/H2O2/c1-2/h1-2HInChI=1S/H2O/h1H2h2oOO0
1CCCCCCCCOid0[O:1]([O:2][H:4])[H:3]-5.797229-14.171550aug11bInChI=1/H2O2/c1-2/h1-2HInChI=1S/C8H18O/c1-2-3-...1-octanolOO0
\n", + "
" + ], + "text/plain": [ + " solvent_smiles solute_name solute_smiles Gsolv (kcal/mol) \\\n", + "0 O id0 [O:1]([O:2][H:4])[H:3] -7.178090 \n", + "1 CCCCCCCCO id0 [O:1]([O:2][H:4])[H:3] -5.797229 \n", + "\n", + " Hsolv (kcal/mol) project \\\n", + "0 -11.557472 aug11b \n", + "1 -14.171550 aug11b \n", + "\n", + " hash solvent_name smiles \\\n", + "0 InChI=1/H2O2/c1-2/h1-2HInChI=1S/H2O/h1H2 h2o OO \n", + "1 InChI=1/H2O2/c1-2/h1-2HInChI=1S/C8H18O/c1-2-3-... 1-octanol OO \n", + "\n", + " num_radical_electrons \n", + "0 0 \n", + "1 0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 101513860 rows\n" + ] + } + ], + "source": [ + "data[\"smiles\"] = data[\"solute_smiles\"].map(solute_to_canonical_smiles_mapping.get)\n", + "data[\"num_radical_electrons\"] = data[\"solute_smiles\"].map(\n", + " solute_to_num_radical_electrons_mapping.get\n", + ")\n", + "head(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "f647ba78", + "metadata": {}, + "outputs": [], + "source": [ + "ouput_columns = [\"smiles\", \"Gsolv (kcal/mol)\", \"Hsolv (kcal/mol)\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d98547ac", + "metadata": {}, + "outputs": [], + "source": [ + "grouped_solvents = data.groupby([\"solvent_name\", \"num_radical_electrons\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d0f2ea8b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b7c7f4d100c540c89bc5ca2327272a0b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/590 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cosmo_namesmilesinchicosmo_confsourceexp_dielectricTsource.1polarprotic
0(1,1-dimethylethyl)benzeneCC(C)(C)c1ccccc1InChI=1/C10H14/c1-10(2,3)9-7-5-4-6-8-9/h4-8H,1...1COSMObase2.35920.0NaN0.00
11-(1,1-dimethylethoxy)-2-propanolCC(O)COC(C)(C)CInChI=1/C7H16O2/c1-6(8)5-9-7(2,3)4/h6,8H,5H2,1...5COSMObaseNaNNaNNaNNaN1
\n", + "" + ], + "text/plain": [ + " cosmo_name smiles \\\n", + "0 (1,1-dimethylethyl)benzene CC(C)(C)c1ccccc1 \n", + "1 1-(1,1-dimethylethoxy)-2-propanol CC(O)COC(C)(C)C \n", + "\n", + " inchi cosmo_conf source \\\n", + "0 InChI=1/C10H14/c1-10(2,3)9-7-5-4-6-8-9/h4-8H,1... 1 COSMObase \n", + "1 InChI=1/C7H16O2/c1-6(8)5-9-7(2,3)4/h6,8H,5H2,1... 5 COSMObase \n", + "\n", + " exp_dielectric T source.1 polar protic \n", + "0 2.359 20.0 NaN 0.0 0 \n", + "1 NaN NaN NaN NaN 1 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 295 rows\n" + ] + } + ], + "source": [ + "names_df = pd.read_csv(pathlib.Path.cwd() / \"solvents.csv\")\n", + "head(names_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d62c4994", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.int64(0)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "non_isomeric_canonical_smiles = names_df[\"smiles\"].apply(\n", + " lambda x: canonical_smiles(x, isomeric=False)\n", + ")\n", + "(names_df[\"smiles\"] != non_isomeric_canonical_smiles).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1fcd250e", + "metadata": {}, + "outputs": [], + "source": [ + "smiles_to_name_mapping = names_df.set_index(\"smiles\")[\"cosmo_name\"].to_dict()" + ] + }, + { + "cell_type": "markdown", + "id": "e57c6323", + "metadata": {}, + "source": [ + "## Transition State Data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2a46f378", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
solvent_smilessolute_namesolute_smilesGsolv (kcal/mol)Hsolv (kcal/mol)r1r2p1p2r1_Gsolvr2_Gsolvp1_Gsolvp2_GsolvDDGsolv_forward (kcal/mol)DDGsolv_reverse (kcal/mol)r1_Hsolvr2_Hsolvp1_Hsolvp2_HsolvDDHsolv_forward (kcal/mol)DDHsolv_reverse (kcal/mol)
0CCCCOP(OCCCC)(OCCCC)=O52940[H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:...-12.065855-21.383296[O:1][O:2][H:3][C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[O:1]([O:2][H:4])[H:3]-7.216970-7.000390-6.952042-8.1066572.1515042.992844-13.229345-12.452517-12.267439-16.9559984.2985667.840141
1CCCCCCCCCCCCC52940[H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:...-7.794416-14.052868[O:1][O:2][H:3][C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[O:1]([O:2][H:4])[H:3]-0.905423-6.425314-6.485875-1.134200-0.463679-0.174341-3.678896-11.593300-11.621092-4.3251111.2193281.893335
\n", + "
" + ], + "text/plain": [ + " solvent_smiles solute_name \\\n", + "0 CCCCOP(OCCCC)(OCCCC)=O 52940 \n", + "1 CCCCCCCCCCCCC 52940 \n", + "\n", + " solute_smiles Gsolv (kcal/mol) \\\n", + "0 [H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:... -12.065855 \n", + "1 [H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:... -7.794416 \n", + "\n", + " Hsolv (kcal/mol) r1 \\\n", + "0 -21.383296 [O:1][O:2][H:3] \n", + "1 -14.052868 [O:1][O:2][H:3] \n", + "\n", + " r2 \\\n", + "0 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... \n", + "1 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... \n", + "\n", + " p1 p2 \\\n", + "0 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... [O:1]([O:2][H:4])[H:3] \n", + "1 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... [O:1]([O:2][H:4])[H:3] \n", + "\n", + " r1_Gsolv r2_Gsolv p1_Gsolv p2_Gsolv DDGsolv_forward (kcal/mol) \\\n", + "0 -7.216970 -7.000390 -6.952042 -8.106657 2.151504 \n", + "1 -0.905423 -6.425314 -6.485875 -1.134200 -0.463679 \n", + "\n", + " DDGsolv_reverse (kcal/mol) r1_Hsolv r2_Hsolv p1_Hsolv p2_Hsolv \\\n", + "0 2.992844 -13.229345 -12.452517 -12.267439 -16.955998 \n", + "1 -0.174341 -3.678896 -11.593300 -11.621092 -4.325111 \n", + "\n", + " DDHsolv_forward (kcal/mol) DDHsolv_reverse (kcal/mol) \n", + "0 4.298566 7.840141 \n", + "1 1.219328 1.893335 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 21684582 rows\n" + ] + } + ], + "source": [ + "data = pd.read_csv(\n", + " PAPER_DATA_DIR\n", + " / \"ts_solvation\"\n", + " / \"FINAL_dG_solv_pruned_nov17_with_reactant_product_dGsolv.csv\",\n", + ")\n", + "head(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c12dc02c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
solvent_smilesnon_isomeric_canonical_smiles
0CCCCOP(OCCCC)(OCCCC)=OCCCCOP(=O)(OCCCC)OCCCC
1CCCCCCCCCCCCCCCCCCCCCCCCCC
\n", + "
" + ], + "text/plain": [ + " solvent_smiles non_isomeric_canonical_smiles\n", + "0 CCCCOP(OCCCC)(OCCCC)=O CCCCOP(=O)(OCCCC)OCCCC\n", + "1 CCCCCCCCCCCCC CCCCCCCCCCCCC" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 295 rows\n" + ] + } + ], + "source": [ + "unique_solvents = pd.DataFrame(\n", + " data[\"solvent_smiles\"].unique(), columns=[\"solvent_smiles\"]\n", + ")\n", + "unique_solvents[\"non_isomeric_canonical_smiles\"] = unique_solvents[\n", + " \"solvent_smiles\"\n", + "].apply(lambda x: canonical_smiles(x, isomeric=False))\n", + "head(unique_solvents)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "dd3333ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.int64(85)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " unique_solvents[\"solvent_smiles\"]\n", + " != unique_solvents[\"non_isomeric_canonical_smiles\"]\n", + ").sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "261462dd", + "metadata": {}, + "outputs": [], + "source": [ + "solvent_smiles_to_non_isomeric_canonical_smiles_mapping = unique_solvents.set_index(\n", + " \"solvent_smiles\"\n", + ")[\"non_isomeric_canonical_smiles\"].to_dict()\n", + "\n", + "solvent_smiles_to_name_mapping = {\n", + " k: smiles_to_name_mapping[v]\n", + " for k, v in solvent_smiles_to_non_isomeric_canonical_smiles_mapping.items()\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "857a77a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
solvent_smilessolute_namesolute_smilesGsolv (kcal/mol)Hsolv (kcal/mol)r1r2p1p2r1_Gsolvr2_Gsolvp1_Gsolvp2_GsolvDDGsolv_forward (kcal/mol)DDGsolv_reverse (kcal/mol)r1_Hsolvr2_Hsolvp1_Hsolvp2_HsolvDDHsolv_forward (kcal/mol)DDHsolv_reverse (kcal/mol)solvent_name
0CCCCOP(OCCCC)(OCCCC)=O52940[H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:...-12.065855-21.383296[O:1][O:2][H:3][C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[O:1]([O:2][H:4])[H:3]-7.216970-7.000390-6.952042-8.1066572.1515042.992844-13.229345-12.452517-12.267439-16.9559984.2985667.840141tri-n-butylphosphate
1CCCCCCCCCCCCC52940[H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:...-7.794416-14.052868[O:1][O:2][H:3][C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[O:1]([O:2][H:4])[H:3]-0.905423-6.425314-6.485875-1.134200-0.463679-0.174341-3.678896-11.593300-11.621092-4.3251111.2193281.893335tridecane
\n", + "
" + ], + "text/plain": [ + " solvent_smiles solute_name \\\n", + "0 CCCCOP(OCCCC)(OCCCC)=O 52940 \n", + "1 CCCCCCCCCCCCC 52940 \n", + "\n", + " solute_smiles Gsolv (kcal/mol) \\\n", + "0 [H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:... -12.065855 \n", + "1 [H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:... -7.794416 \n", + "\n", + " Hsolv (kcal/mol) r1 \\\n", + "0 -21.383296 [O:1][O:2][H:3] \n", + "1 -14.052868 [O:1][O:2][H:3] \n", + "\n", + " r2 \\\n", + "0 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... \n", + "1 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... \n", + "\n", + " p1 p2 \\\n", + "0 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... [O:1]([O:2][H:4])[H:3] \n", + "1 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... [O:1]([O:2][H:4])[H:3] \n", + "\n", + " r1_Gsolv r2_Gsolv p1_Gsolv p2_Gsolv DDGsolv_forward (kcal/mol) \\\n", + "0 -7.216970 -7.000390 -6.952042 -8.106657 2.151504 \n", + "1 -0.905423 -6.425314 -6.485875 -1.134200 -0.463679 \n", + "\n", + " DDGsolv_reverse (kcal/mol) r1_Hsolv r2_Hsolv p1_Hsolv p2_Hsolv \\\n", + "0 2.992844 -13.229345 -12.452517 -12.267439 -16.955998 \n", + "1 -0.174341 -3.678896 -11.593300 -11.621092 -4.325111 \n", + "\n", + " DDHsolv_forward (kcal/mol) DDHsolv_reverse (kcal/mol) \\\n", + "0 4.298566 7.840141 \n", + "1 1.219328 1.893335 \n", + "\n", + " solvent_name \n", + "0 tri-n-butylphosphate \n", + "1 tridecane " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 21684582 rows\n" + ] + } + ], + "source": [ + "data[\"solvent_name\"] = data[\"solvent_smiles\"].map(solvent_smiles_to_name_mapping)\n", + "head(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "9750bd11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
atom_mapped_smiles
0[C:1]([C:2]([N:3]([C:4]([C:5]([O:6][H:23])([H:...
1[C:1]([C:2]([C:3]([C:4]1=[N:5][N:6][C:7]([C:8]...
\n", + "
" + ], + "text/plain": [ + " atom_mapped_smiles\n", + "0 [C:1]([C:2]([N:3]([C:4]([C:5]([O:6][H:23])([H:...\n", + "1 [C:1]([C:2]([C:3]([C:4]1=[N:5][N:6][C:7]([C:8]..." + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 133253 rows\n" + ] + } + ], + "source": [ + "unique_reactants = pd.DataFrame(\n", + " set().union(*[data[col] for col in [\"r1\", \"r2\", \"p1\", \"p2\"]]),\n", + " columns=[\"atom_mapped_smiles\"],\n", + ")\n", + "head(unique_reactants)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4327c0e7", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "305f57f8f3ae4f8dbabb9e8ad3cd21a3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Generating canonical smiles: 0%| | 0/133253 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
atom_mapped_smilescanonical_smiles
0[C:1]([C:2]([N:3]([C:4]([C:5]([O:6][H:23])([H:...CCNC(CO)C(CC)C(N)CC
1[C:1]([C:2]([C:3]([C:4]1=[N:5][N:6][C:7]([C:8]...CCCC1=N[N]C(C)=N1
\n", + "" + ], + "text/plain": [ + " atom_mapped_smiles canonical_smiles\n", + "0 [C:1]([C:2]([N:3]([C:4]([C:5]([O:6][H:23])([H:... CCNC(CO)C(CC)C(N)CC\n", + "1 [C:1]([C:2]([C:3]([C:4]1=[N:5][N:6][C:7]([C:8]... CCCC1=N[N]C(C)=N1" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 133253 rows\n" + ] + } + ], + "source": [ + "unique_reactants[\"canonical_smiles\"] = swifter_apply(\n", + " unique_reactants[\"atom_mapped_smiles\"],\n", + " lambda x: canonical_smiles(x, remove_atom_mapping=True),\n", + " \"Generating canonical smiles\",\n", + ")\n", + "head(unique_reactants)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7c583530", + "metadata": {}, + "outputs": [], + "source": [ + "species_to_canonical_smiles_mapping = unique_reactants.set_index(\n", + " \"atom_mapped_smiles\"\n", + ")[\"canonical_smiles\"].to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "db3e0f11", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "80f670bd01db45f391109a06235f0cec", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Mapping r1: 0%| | 0/21684582 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
solvent_smilessolute_namesolute_smilesGsolv (kcal/mol)Hsolv (kcal/mol)r1r2p1p2r1_Gsolvr2_Gsolvp1_Gsolvp2_GsolvDDGsolv_forward (kcal/mol)DDGsolv_reverse (kcal/mol)r1_Hsolvr2_Hsolvp1_Hsolvp2_HsolvDDHsolv_forward (kcal/mol)DDHsolv_reverse (kcal/mol)solvent_namer1_smilesr2_smilesp1_smilesp2_smiles
0CCCCOP(OCCCC)(OCCCC)=O52940[H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:...-12.065855-21.383296[O:1][O:2][H:3][C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[O:1]([O:2][H:4])[H:3]-7.216970-7.000390-6.952042-8.1066572.1515042.992844-13.229345-12.452517-12.267439-16.9559984.2985667.840141tri-n-butylphosphate[O]OCN1CC2CNCC2C1CN1CC2C[N]CC2C1OO
1CCCCCCCCCCCCC52940[H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:...-7.794416-14.052868[O:1][O:2][H:3][C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[O:1]([O:2][H:4])[H:3]-0.905423-6.425314-6.485875-1.134200-0.463679-0.174341-3.678896-11.593300-11.621092-4.3251111.2193281.893335tridecane[O]OCN1CC2CNCC2C1CN1CC2C[N]CC2C1OO
\n", + "" + ], + "text/plain": [ + " solvent_smiles solute_name \\\n", + "0 CCCCOP(OCCCC)(OCCCC)=O 52940 \n", + "1 CCCCCCCCCCCCC 52940 \n", + "\n", + " solute_smiles Gsolv (kcal/mol) \\\n", + "0 [H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:... -12.065855 \n", + "1 [H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:... -7.794416 \n", + "\n", + " Hsolv (kcal/mol) r1 \\\n", + "0 -21.383296 [O:1][O:2][H:3] \n", + "1 -14.052868 [O:1][O:2][H:3] \n", + "\n", + " r2 \\\n", + "0 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... \n", + "1 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... \n", + "\n", + " p1 p2 \\\n", + "0 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... [O:1]([O:2][H:4])[H:3] \n", + "1 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... [O:1]([O:2][H:4])[H:3] \n", + "\n", + " r1_Gsolv r2_Gsolv p1_Gsolv p2_Gsolv DDGsolv_forward (kcal/mol) \\\n", + "0 -7.216970 -7.000390 -6.952042 -8.106657 2.151504 \n", + "1 -0.905423 -6.425314 -6.485875 -1.134200 -0.463679 \n", + "\n", + " DDGsolv_reverse (kcal/mol) r1_Hsolv r2_Hsolv p1_Hsolv p2_Hsolv \\\n", + "0 2.992844 -13.229345 -12.452517 -12.267439 -16.955998 \n", + "1 -0.174341 -3.678896 -11.593300 -11.621092 -4.325111 \n", + "\n", + " DDHsolv_forward (kcal/mol) DDHsolv_reverse (kcal/mol) \\\n", + "0 4.298566 7.840141 \n", + "1 1.219328 1.893335 \n", + "\n", + " solvent_name r1_smiles r2_smiles p1_smiles p2_smiles \n", + "0 tri-n-butylphosphate [O]O CN1CC2CNCC2C1 CN1CC2C[N]CC2C1 OO \n", + "1 tridecane [O]O CN1CC2CNCC2C1 CN1CC2C[N]CC2C1 OO " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 21684582 rows\n" + ] + } + ], + "source": [ + "for col in [\"r1\", \"r2\", \"p1\", \"p2\"]:\n", + " data[col + \"_smiles\"] = swifter_apply(\n", + " data[col],\n", + " lambda x: species_to_canonical_smiles_mapping.get(x),\n", + " f\"Mapping {col}\",\n", + " )\n", + "head(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": 
"0c64bae7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
solvent_smilessolute_namesolute_smilesGsolv (kcal/mol)Hsolv (kcal/mol)r1r2p1p2r1_Gsolvr2_Gsolvp1_Gsolvp2_GsolvDDGsolv_forward (kcal/mol)DDGsolv_reverse (kcal/mol)r1_Hsolvr2_Hsolvp1_Hsolvp2_HsolvDDHsolv_forward (kcal/mol)DDHsolv_reverse (kcal/mol)solvent_namer1_smilesr2_smilesp1_smilesp2_smilesrxn_smiles
0CCCCOP(OCCCC)(OCCCC)=O52940[H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:...-12.065855-21.383296[O:1][O:2][H:3][C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[O:1]([O:2][H:4])[H:3]-7.216970-7.000390-6.952042-8.1066572.1515042.992844-13.229345-12.452517-12.267439-16.9559984.2985667.840141tri-n-butylphosphate[O]OCN1CC2CNCC2C1CN1CC2C[N]CC2C1OO[O]O.CN1CC2CNCC2C1>>CN1CC2C[N]CC2C1.OO
1CCCCCCCCCCCCC52940[H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:...-7.794416-14.052868[O:1][O:2][H:3][C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]...[O:1]([O:2][H:4])[H:3]-0.905423-6.425314-6.485875-1.134200-0.463679-0.174341-3.678896-11.593300-11.621092-4.3251111.2193281.893335tridecane[O]OCN1CC2CNCC2C1CN1CC2C[N]CC2C1OO[O]O.CN1CC2CNCC2C1>>CN1CC2C[N]CC2C1.OO
\n", + "
" + ], + "text/plain": [ + " solvent_smiles solute_name \\\n", + "0 CCCCOP(OCCCC)(OCCCC)=O 52940 \n", + "1 CCCCCCCCCCCCC 52940 \n", + "\n", + " solute_smiles Gsolv (kcal/mol) \\\n", + "0 [H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:... -12.065855 \n", + "1 [H:1][O:3][O:2].[H:4][N:25]1[C:19]([H:10])([H:... -7.794416 \n", + "\n", + " Hsolv (kcal/mol) r1 \\\n", + "0 -21.383296 [O:1][O:2][H:3] \n", + "1 -14.052868 [O:1][O:2][H:3] \n", + "\n", + " r2 \\\n", + "0 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... \n", + "1 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... \n", + "\n", + " p1 p2 \\\n", + "0 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... [O:1]([O:2][H:4])[H:3] \n", + "1 [C:1]([N:2]1[C:3]([H:13])([H:14])[C:4]2([H:15]... [O:1]([O:2][H:4])[H:3] \n", + "\n", + " r1_Gsolv r2_Gsolv p1_Gsolv p2_Gsolv DDGsolv_forward (kcal/mol) \\\n", + "0 -7.216970 -7.000390 -6.952042 -8.106657 2.151504 \n", + "1 -0.905423 -6.425314 -6.485875 -1.134200 -0.463679 \n", + "\n", + " DDGsolv_reverse (kcal/mol) r1_Hsolv r2_Hsolv p1_Hsolv p2_Hsolv \\\n", + "0 2.992844 -13.229345 -12.452517 -12.267439 -16.955998 \n", + "1 -0.174341 -3.678896 -11.593300 -11.621092 -4.325111 \n", + "\n", + " DDHsolv_forward (kcal/mol) DDHsolv_reverse (kcal/mol) \\\n", + "0 4.298566 7.840141 \n", + "1 1.219328 1.893335 \n", + "\n", + " solvent_name r1_smiles r2_smiles p1_smiles p2_smiles \\\n", + "0 tri-n-butylphosphate [O]O CN1CC2CNCC2C1 CN1CC2C[N]CC2C1 OO \n", + "1 tridecane [O]O CN1CC2CNCC2C1 CN1CC2C[N]CC2C1 OO \n", + "\n", + " rxn_smiles \n", + "0 [O]O.CN1CC2CNCC2C1>>CN1CC2C[N]CC2C1.OO \n", + "1 [O]O.CN1CC2CNCC2C1>>CN1CC2C[N]CC2C1.OO " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Contains 21684582 rows\n" + ] + } + ], + "source": [ + "data[\"rxn_smiles\"] = (\n", + " data[\"r1_smiles\"]\n", + " + \".\"\n", + " + data[\"r2_smiles\"]\n", + " + \">>\"\n", + " + data[\"p1_smiles\"]\n", + " + \".\"\n", + " + 
data[\"p2_smiles\"]\n", + ")\n", + "head(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "238e8694", + "metadata": {}, + "outputs": [], + "source": [ + "ouput_columns = [\n", + " \"rxn_smiles\",\n", + " \"Gsolv (kcal/mol)\",\n", + " \"r1_Gsolv\",\n", + " \"r2_Gsolv\",\n", + " \"p1_Gsolv\",\n", + " \"p2_Gsolv\",\n", + " \"DDGsolv_forward (kcal/mol)\",\n", + " \"DDGsolv_reverse (kcal/mol)\",\n", + " \"Hsolv (kcal/mol)\",\n", + " \"r1_Hsolv\",\n", + " \"r2_Hsolv\",\n", + " \"p1_Hsolv\",\n", + " \"p2_Hsolv\",\n", + " \"DDHsolv_forward (kcal/mol)\",\n", + " \"DDHsolv_reverse (kcal/mol)\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "81378572", + "metadata": {}, + "outputs": [], + "source": [ + "grouped_solvents = data.groupby(\"solvent_name\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "e8d10a8c", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c0aebe7e7d3e4e9faaf4d900b6d4872f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/295 [00:00