|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 1, |
| 6 | + "id": "4d2423d0", |
| 7 | + "metadata": {}, |
| 8 | + "outputs": [], |
| 9 | + "source": [ |
| 10 | + "from comptox_ai.db.graph_db import GraphDB\n", |
| 11 | + "import pandas as pd\n", |
| 12 | + "from tqdm import tqdm_notebook" |
| 13 | + ] |
| 14 | + }, |
| 15 | + { |
| 16 | + "cell_type": "code", |
| 17 | + "execution_count": 2, |
| 18 | + "id": "b1ffc150", |
| 19 | + "metadata": {}, |
| 20 | + "outputs": [], |
| 21 | + "source": [ |
| 22 | + "db = GraphDB(hostname=\"neo4j.comptox.ai\")  # GraphDB handle configured for the neo4j.comptox.ai host\n", |
| 23 | + "pf_db = pd.read_excel(\"/Users/jdr2160/Downloads/perfluorome database.xlsx\")  # NOTE(review): absolute, user-specific path; consider a configurable data directory" |
| 24 | + ] |
| 25 | + }, |
| 26 | + { |
| 27 | + "cell_type": "code", |
| 28 | + "execution_count": 3, |
| 29 | + "id": "b126d43c", |
| 30 | + "metadata": {}, |
| 31 | + "outputs": [], |
| 32 | + "source": [ |
| 33 | + "casrns = list(dict.fromkeys(str(x).split(';')[0] for x in pf_db['CAS.RN'].dropna()))  # first ';'-separated CAS.RN entry per row, de-duplicated in stable file order (blank cells skipped)" |
| 34 | + ] |
| 35 | + }, |
| 36 | + { |
| 37 | + "cell_type": "code", |
| 38 | + "execution_count": 4, |
| 39 | + "id": "d21cad0f", |
| 40 | + "metadata": {}, |
| 41 | + "outputs": [ |
| 42 | + { |
| 43 | + "data": { |
| 44 | + "text/plain": [ |
| 45 | + "401" |
| 46 | + ] |
| 47 | + }, |
| 48 | + "execution_count": 4, |
| 49 | + "metadata": {}, |
| 50 | + "output_type": "execute_result" |
| 51 | + } |
| 52 | + ], |
| 53 | + "source": [ |
| 54 | + "len(casrns)  # number of distinct CASRNs that will be looked up" |
| 55 | + ] |
| 56 | + }, |
| 57 | + { |
| 58 | + "cell_type": "code", |
| 59 | + "execution_count": 5, |
| 60 | + "id": "6cbc3a53", |
| 61 | + "metadata": {}, |
| 62 | + "outputs": [ |
| 63 | + { |
| 64 | + "name": "stderr", |
| 65 | + "output_type": "stream", |
| 66 | + "text": [ |
| 67 | + "/var/folders/mb/jkjn3zh97jz9dbxlswr4mj8c0000gn/T/ipykernel_18198/3870895096.py:5: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n", |
| 68 | + "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n", |
| 69 | + " for crn in tqdm_notebook(casrns):\n" |
| 70 | + ] |
| 71 | + }, |
| 72 | + { |
| 73 | + "data": { |
| 74 | + "application/vnd.jupyter.widget-view+json": { |
| 75 | + "model_id": "879a1596678f42518aa12127da8998eb", |
| 76 | + "version_major": 2, |
| 77 | + "version_minor": 0 |
| 78 | + }, |
| 79 | + "text/plain": [ |
| 80 | + " 0%| | 0/401 [00:00<?, ?it/s]" |
| 81 | + ] |
| 82 | + }, |
| 83 | + "metadata": {}, |
| 84 | + "output_type": "display_data" |
| 85 | + }, |
| 86 | + { |
| 87 | + "name": "stdout", |
| 88 | + "output_type": "stream", |
| 89 | + "text": [ |
| 90 | + "281\n" |
| 91 | + ] |
| 92 | + } |
| 93 | + ], |
| 94 | + "source": [ |
| 95 | + "from tqdm.auto import tqdm  # tqdm.tqdm_notebook is deprecated and slated for removal in tqdm 5.0.0\n", |
| 96 | + "# Query the graph once per CASRN: keep the first returned Chemical node, otherwise record the miss.\n", |
| 97 | + "matches, unmatches = [], []\n", |
| 98 | + "for crn in tqdm(casrns, desc=\"CASRN lookup\"):\n", |
| 99 | + "    # NOTE(review): crn is interpolated straight into the Cypher text; assumes it contains no quotes or backslashes.\n", |
| 100 | + "    res = db.run_cypher(f\"MATCH (c:Chemical {{ xrefCasRN: \\\"{crn}\\\" }} ) RETURN c;\")\n", |
| 101 | + "    if len(res) > 0:\n", |
| 102 | + "        matches.append(res[0]['c'])\n", |
| 103 | + "    else:\n", |
| 104 | + "        unmatches.append(crn)\n", |
| 105 | + "\n", |
| 106 | + "count = len(matches)  # derived from the result list so the tally cannot drift from it\n", |
| 107 | + "print(count)" |
| 108 | + ] |
| 109 | + }, |
| 110 | + { |
| 111 | + "cell_type": "code", |
| 112 | + "execution_count": 6, |
| 113 | + "id": "78c472c1", |
| 114 | + "metadata": {}, |
| 115 | + "outputs": [ |
| 116 | + { |
| 117 | + "data": { |
| 118 | + "text/plain": [ |
| 119 | + "['147916-78-7',\n", |
| 120 | + " '32848-21-8',\n", |
| 121 | + " '2127-74-4',\n", |
| 122 | + " '476304-39-9',\n", |
| 123 | + " '133310-68-6',\n", |
| 124 | + " '38012-79-2',\n", |
| 125 | + " '215094-37-4',\n", |
| 126 | + " '39492-88-1',\n", |
| 127 | + " '94720-20-4',\n", |
| 128 | + " '102489-65-6',\n", |
| 129 | + " '57325-43-6',\n", |
| 130 | + " '69492-70-2',\n", |
| 131 | + " '19932-26-4',\n", |
| 132 | + " '1274722-59-6',\n", |
| 133 | + " '75668-27-8',\n", |
| 134 | + " '76848-59-4',\n", |
| 135 | + " '220469-13-6',\n", |
| 136 | + " '250738-42-2',\n", |
| 137 | + " '89109-69-3',\n", |
| 138 | + " '678-98-8',\n", |
| 139 | + " '355-88-4',\n", |
| 140 | + " '371771-07-2',\n", |
| 141 | + " '131851-20-2',\n", |
| 142 | + " '106873-68-1',\n", |
| 143 | + " '1207481-10-4',\n", |
| 144 | + " 'NOCAS_893419',\n", |
| 145 | + " '355-93-1',\n", |
| 146 | + " '164792-01-2',\n", |
| 147 | + " '1513864-12-4',\n", |
| 148 | + " '879881-65-9',\n", |
| 149 | + " '355-30-6',\n", |
| 150 | + " '240129-40-2',\n", |
| 151 | + " '68541-02-6',\n", |
| 152 | + " '70501-47-2',\n", |
| 153 | + " '1426894-99-6',\n", |
| 154 | + " '1946796-71-9',\n", |
| 155 | + " '496805-64-2',\n", |
| 156 | + " '314053-71-9',\n", |
| 157 | + " '2707-72-4',\n", |
| 158 | + " '1444474-83-2',\n", |
| 159 | + " '1391033-22-9',\n", |
| 160 | + " '123665-83-8',\n", |
| 161 | + " '4089-61-6',\n", |
| 162 | + " '147545-41-3',\n", |
| 163 | + " '213681-67-5',\n", |
| 164 | + " '171182-86-8',\n", |
| 165 | + " '1433216-51-3',\n", |
| 166 | + " '1030606-42-8',\n", |
| 167 | + " '136909-85-8',\n", |
| 168 | + " '103831-29-4',\n", |
| 169 | + " '94333-56-9',\n", |
| 170 | + " '755-89-5',\n", |
| 171 | + " '377-37-7',\n", |
| 172 | + " '918-32-1',\n", |
| 173 | + " '78755-31-4',\n", |
| 174 | + " '313366-93-7',\n", |
| 175 | + " '1207727-04-5',\n", |
| 176 | + " '102489-67-8',\n", |
| 177 | + " '144862-38-4',\n", |
| 178 | + " '1268707-97-6',\n", |
| 179 | + " '1612778-34-3',\n", |
| 180 | + " '312-81-2',\n", |
| 181 | + " '1980063-68-0',\n", |
| 182 | + " '2708-54-5',\n", |
| 183 | + " '1355553-99-9',\n", |
| 184 | + " '2251-83-4',\n", |
| 185 | + " '72828-80-9',\n", |
| 186 | + " '82721-69-5',\n", |
| 187 | + " '1220100-43-5',\n", |
| 188 | + " '431-63-0',\n", |
| 189 | + " '121633-31-6',\n", |
| 190 | + " '200337-06-0',\n", |
| 191 | + " '36390-03-1',\n", |
| 192 | + " '176702-71-9',\n", |
| 193 | + " '1257261-91-8',\n", |
| 194 | + " '1161941-02-1',\n", |
| 195 | + " '2212-77-3',\n", |
| 196 | + " '146304-76-9',\n", |
| 197 | + " '94158-67-5',\n", |
| 198 | + " '83650-67-3',\n", |
| 199 | + " '867373-18-0',\n", |
| 200 | + " '714975-29-8',\n", |
| 201 | + " '312943-34-3',\n", |
| 202 | + " '113584-32-0',\n", |
| 203 | + " '119206-62-1',\n", |
| 204 | + " '102061-82-5',\n", |
| 205 | + " '100427-76-7',\n", |
| 206 | + " '681443-29-8',\n", |
| 207 | + " '125640-21-3',\n", |
| 208 | + " '1190430-20-6',\n", |
| 209 | + " '307-96-0',\n", |
| 210 | + " '872672-61-2',\n", |
| 211 | + " '238098-38-9',\n", |
| 212 | + " '129846-67-9',\n", |
| 213 | + " '71623-69-3',\n", |
| 214 | + " '213207-95-5',\n", |
| 215 | + " '93393-77-2',\n", |
| 216 | + " '355-99-7',\n", |
| 217 | + " '123613-18-3',\n", |
| 218 | + " '1355554-66-3',\n", |
| 219 | + " '188034-84-6',\n", |
| 220 | + " '679-25-4',\n", |
| 221 | + " '13050-20-9',\n", |
| 222 | + " '90851-71-1',\n", |
| 223 | + " '377-52-6',\n", |
| 224 | + " '355-98-6',\n", |
| 225 | + " '64790-29-0',\n", |
| 226 | + " '1355555-21-3',\n", |
| 227 | + " '375-83-7',\n", |
| 228 | + " '1456734-51-2',\n", |
| 229 | + " '1426840-85-8',\n", |
| 230 | + " '97388-28-8',\n", |
| 231 | + " '1463530-19-9',\n", |
| 232 | + " '17425-25-1',\n", |
| 233 | + " '755-53-3',\n", |
| 234 | + " '53638-09-8',\n", |
| 235 | + " '755-76-0',\n", |
| 236 | + " '424-01-1',\n", |
| 237 | + " '120219-46-7',\n", |
| 238 | + " '152718-74-6']" |
| 239 | + ] |
| 240 | + }, |
| 241 | + "execution_count": 6, |
| 242 | + "metadata": {}, |
| 243 | + "output_type": "execute_result" |
| 244 | + } |
| 245 | + ], |
| 246 | + "source": [ |
| 247 | + "unmatches  # CASRNs for which the Chemical query returned no rows" |
| 248 | + ] |
| 249 | + }, |
| 250 | + { |
| 251 | + "cell_type": "code", |
| 252 | + "execution_count": 7, |
| 253 | + "id": "fc098451", |
| 254 | + "metadata": {}, |
| 255 | + "outputs": [ |
| 256 | + { |
| 257 | + "data": { |
| 258 | + "text/plain": [ |
| 259 | + "{'commonName': 'Propanamide, N-(2,5-dichloro-4-nitrophenyl)-2,3,3,3-tetrafluoro-2-(trifluoromethyl)-',\n", |
| 260 | + " 'maccs': '0000000000000000000000010000000000000000010000001000000100000010010001100000000000000010000101000000011001100101000010100101000001001111000001010101011001010110111110',\n", |
| 261 | + " 'synonyms': '',\n", |
| 262 | + " 'sMILES': '[O-][N+](=O)C1=CC(Cl)=C(NC(=O)C(F)(C(F)(F)F)C(F)(F)F)C=C1Cl',\n", |
| 263 | + " 'xrefPubchemSID': '315701706',\n", |
| 264 | + " 'xrefDTXSID': 'DTXSID2073772',\n", |
| 265 | + " 'xrefPubchemCID': '176014',\n", |
| 266 | + " 'xrefCasRN': '105923-43-1',\n", |
| 267 | + " 'uri': 'http://jdr.bio/ontologies/comptox.owl#chemical_dtxsid2073772'}" |
| 268 | + ] |
| 269 | + }, |
| 270 | + "execution_count": 7, |
| 271 | + "metadata": {}, |
| 272 | + "output_type": "execute_result" |
| 273 | + } |
| 274 | + ], |
| 275 | + "source": [ |
| 276 | + "matches[0]  # inspect the properties of the first matched Chemical node" |
| 277 | + ] |
| 278 | + }, |
| 279 | + { |
| 280 | + "cell_type": "code", |
| 281 | + "execution_count": null, |
| 282 | + "id": "5aa9ed06", |
| 283 | + "metadata": {}, |
| 284 | + "outputs": [], |
| 285 | + "source": [] |
| 286 | + } |
| 287 | + ], |
| 288 | + "metadata": { |
| 289 | + "kernelspec": { |
| 290 | + "display_name": "Python 3 (ipykernel)", |
| 291 | + "language": "python", |
| 292 | + "name": "python3" |
| 293 | + }, |
| 294 | + "language_info": { |
| 295 | + "codemirror_mode": { |
| 296 | + "name": "ipython", |
| 297 | + "version": 3 |
| 298 | + }, |
| 299 | + "file_extension": ".py", |
| 300 | + "mimetype": "text/x-python", |
| 301 | + "name": "python", |
| 302 | + "nbconvert_exporter": "python", |
| 303 | + "pygments_lexer": "ipython3", |
| 304 | + "version": "3.9.6" |
| 305 | + } |
| 306 | + }, |
| 307 | + "nbformat": 4, |
| 308 | + "nbformat_minor": 5 |
| 309 | +} |
0 commit comments