@@ -50,6 +50,65 @@ public class StandardizeReaction {
5050
5151 private static final ILoggingTool LOGGER = createLoggingTool (StandardizeReaction .class );
5252
53+ /**
54+ * Common solvents, counter-ions, and reagents, keyed by canonical SMILES.
55+ * The reagent filter assumes these molecules facilitate or mediate a reaction without their own bonds changing.
56+ * Lookup is an exact string match, so each entry must equal the SmilesGenerator (SmiFlavor.Canonical) output.
57+ */
58+ private static final Set <String > KNOWN_REAGENT_SMILES = new HashSet <>(Arrays .asList (
59+ // Solvents
60+ "ClCCl" , // DCM (dichloromethane)
61+ "ClC(Cl)Cl" , // chloroform
62+ "CC(C)=O" , // acetone
63+ "CCCCCC" , // hexane
64+ "c1ccncc1" , // pyridine (also base)
65+ "CC#N" , // acetonitrile (MeCN)
66+ "CS(C)=O" , // DMSO
67+ "CN(C)C=O" , // DMF
68+ "C1CCOC1" , // THF
69+ "CCOCC" , // diethyl ether
70+ "C1COCCO1" , // 1,4-dioxane
71+ "CO" , // methanol
72+ "CCO" , // ethanol
73+ "CC(C)O" , // isopropanol
74+ "O" , // water
75+ "CC(=O)O" , // acetic acid (when used as solvent)
76+ "CCOC(C)=O" , // ethyl acetate
77+ "c1ccccc1" , // benzene
78+ "Cc1ccccc1" , // toluene
79+ "c1ccc(cc1)C" , // toluene, alternate atom order -- NOTE(review): only matches if the generator emits this exact string; confirm
80+ // Counter-ions (spectator cations and anions)
81+ "[Na+]" , // sodium cation
82+ "[K+]" , // potassium cation
83+ "[Li+]" , // lithium cation
84+ "[Cs+]" , // cesium cation
85+ "[NH4+]" , // ammonium
86+ "[Cl-]" , // chloride
87+ "[Br-]" , // bromide
88+ "[I-]" , // iodide
89+ "[OH-]" , // hydroxide
90+ // Inorganic bases/acids
91+ "[Na]O" , // NaOH
92+ "O=S(=O)(O)O" , // sulfuric acid
93+ // Other reagents and metal species
94+ "O=S(Cl)Cl" , // thionyl chloride (reagent but bonds don't map)
95+ "[Mg+2]" , // magnesium ion
96+ "[Ca+2]" , // calcium ion
97+ "[Zn]" , // zinc
98+ // Borohydride / cyanoborohydride (reducing agents)
99+ "[BH4-]" , // borohydride
100+ "[BH3-]C#N" // cyanoborohydride
101+ ));
102+
103+ /**
104+ * Element symbols of metals commonly found in catalysts.
105+ * The reagent filter flags any reactant containing an atom whose symbol is in this set as a likely catalyst, not a reactant.
106+ */
107+ private static final Set <String > CATALYST_METALS = new HashSet <>(Arrays .asList (
108+ "Pd" , "Pt" , "Rh" , "Ru" , "Ir" , "Ni" , "Cu" , "Fe" ,
109+ "Co" , "Mn" , "Ti" , "Zr" , "Mo" , "W" , "Os" , "Ag" , "Au"
110+ ));
111+
53112 /**
54113 * Standardize a reaction: clean mappings, validate balance, build containers.
55114 *
@@ -132,55 +191,80 @@ public IReaction filterReagents(IReaction reaction) {
132191 List <IAtomContainer > keptReactants = new ArrayList <>();
133192 List <IAtomContainer > reagents = new ArrayList <>();
134193
194+ // Generate canonical SMILES for known-reagent lookup
195+ org .openscience .cdk .smiles .SmilesGenerator smiGen =
196+ new org .openscience .cdk .smiles .SmilesGenerator (
197+ org .openscience .cdk .smiles .SmiFlavor .Canonical );
198+
135199 for (IAtomContainer reactant : reaction .getReactants ().atomContainers ()) {
136200 boolean isReagent = false ;
201+ String reason = "" ;
202+
137203 try {
138- org .openscience .cdk .fingerprint .IBitFingerprint reactantFP = fp .getBitFingerprint (reactant );
139-
140- // Find max similarity to any product
141- double maxSim = 0.0 ;
142- for (org .openscience .cdk .fingerprint .IBitFingerprint prodFP : productFPs ) {
143- if (prodFP != null ) {
144- double sim = Tanimoto .calculate (reactantFP , prodFP );
145- maxSim = Math .max (maxSim , sim );
146- }
204+ // Check 1: Known solvent/reagent by canonical SMILES
205+ String canSmiles = smiGen .create (reactant );
206+ if (KNOWN_REAGENT_SMILES .contains (canSmiles )) {
207+ isReagent = true ;
208+ reason = "known reagent/solvent: " + canSmiles ;
147209 }
148210
149- // If no product resembles this reactant, it's likely a reagent
150- if (maxSim < 0.3 && reactant .getAtomCount () > 0 ) {
151- // Double-check: does any atom type in this molecule appear
152- // exclusively in the reactant (i.e., removed in products)?
153- // If so, it might still be a real reactant (leaving group)
154- boolean hasUniqueContribution = false ;
155- Map <String , Integer > reactantAtomCounts = new LinkedHashMap <>();
211+ // Check 2: Contains catalyst metal
212+ if (!isReagent ) {
156213 for (IAtom atom : reactant .atoms ()) {
157- reactantAtomCounts .merge (atom .getSymbol (), 1 , Integer ::sum );
158- }
159- // Check if this reactant contributes atoms not in products
160- for (Map .Entry <String , Integer > entry : reactantAtomCounts .entrySet ()) {
161- if (!productAtomCounts .containsKey (entry .getKey ())) {
162- hasUniqueContribution = true ;
214+ if (CATALYST_METALS .contains (atom .getSymbol ())) {
215+ isReagent = true ;
216+ reason = "catalyst metal: " + atom .getSymbol ();
163217 break ;
164218 }
165219 }
220+ }
166221
167- // Only filter if: low similarity AND no unique atom contribution
168- // AND molecule is small (≤ 10 heavy atoms) — large molecules
169- // are more likely to be real reactants
170- int heavyAtomCount = 0 ;
171- for (IAtom atom : reactant .atoms ()) {
172- if (!"H" .equals (atom .getSymbol ())) heavyAtomCount ++;
222+ // Check 3: Tanimoto fingerprint similarity
223+ if (!isReagent ) {
224+ org .openscience .cdk .fingerprint .IBitFingerprint reactantFP =
225+ fp .getBitFingerprint (reactant );
226+
227+ double maxSim = 0.0 ;
228+ for (org .openscience .cdk .fingerprint .IBitFingerprint prodFP : productFPs ) {
229+ if (prodFP != null ) {
230+ double sim = Tanimoto .calculate (reactantFP , prodFP );
231+ maxSim = Math .max (maxSim , sim );
232+ }
173233 }
174- if (!hasUniqueContribution && heavyAtomCount <= 10 ) {
175- isReagent = true ;
176- LOGGER .debug ("Filtered reagent/solvent: " + reactant .getID ()
177- + " (Tanimoto=" + String .format ("%.2f" , maxSim )
178- + ", atoms=" + heavyAtomCount + ")" );
234+
235+ if (maxSim < 0.4 && reactant .getAtomCount () > 0 ) {
236+ // Check for unique atom contribution
237+ boolean hasUniqueContribution = false ;
238+ Map <String , Integer > reactantAtomCounts = new LinkedHashMap <>();
239+ for (IAtom atom : reactant .atoms ()) {
240+ reactantAtomCounts .merge (atom .getSymbol (), 1 , Integer ::sum );
241+ }
242+ for (Map .Entry <String , Integer > entry : reactantAtomCounts .entrySet ()) {
243+ if (!productAtomCounts .containsKey (entry .getKey ())) {
244+ hasUniqueContribution = true ;
245+ break ;
246+ }
247+ }
248+
249+ int heavyAtomCount = 0 ;
250+ for (IAtom atom : reactant .atoms ()) {
251+ if (!"H" .equals (atom .getSymbol ())) heavyAtomCount ++;
252+ }
253+
254+ if (!hasUniqueContribution && heavyAtomCount <= 10 ) {
255+ isReagent = true ;
256+ reason = "low Tanimoto=" + String .format ("%.2f" , maxSim )
257+ + ", atoms=" + heavyAtomCount ;
258+ }
179259 }
180260 }
181261 } catch (Exception e ) {
182- // If fingerprinting fails, keep the molecule as reactant
183- LOGGER .debug ("Fingerprint failed for " + reactant .getID () + ": " + e .getMessage ());
262+ LOGGER .debug ("Filter check failed for " + reactant .getID ()
263+ + ": " + e .getMessage ());
264+ }
265+
266+ if (isReagent ) {
267+ LOGGER .debug ("Filtered: " + reason );
184268 }
185269
186270 if (isReagent ) {
0 commit comments