Coverage for /builds/ase/ase/ase/data/pubchem.py: 73.00%
100 statements
« prev ^ index » next coverage.py v7.5.3, created at 2025-08-02 00:12 +0000
« prev ^ index » next coverage.py v7.5.3, created at 2025-08-02 00:12 +0000
1import json
2import urllib.request
3import warnings
4from collections import namedtuple
5from io import StringIO
6from urllib.error import HTTPError, URLError
8from ase.io import read
# Root of the PubChem PUG REST interface; the request URLs in this module
# are all built on top of it.
base_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'

# A (search, field) pair: the term to look up and the field it belongs to.
PubchemSearch = namedtuple('PubchemSearch', ['search', 'field'])
class PubchemData:
    """
    Container pairing a structure with the data PubChem returned for it.

    The two constructor arguments are stored unchanged on the public
    attributes `atoms` and `data`.
    """

    def __init__(self, atoms, data):
        self.atoms, self.data = atoms, data

    def get_atoms(self):
        """Return the object stored in `atoms`."""
        return self.atoms

    def get_pubchem_data(self):
        """Return the object stored in `data`."""
        return self.data
def search_pubchem_raw(search, field, silent=False):
    """
    A helper function for searching pubchem.

    Parameters:
        search (str or int):
            the compound you are searching for. This can be either
            a common name, CID, smiles string or conformer ID depending
            on the `field` you are searching

        field (str):
            the particular field you are searching with. Possible values
            are 'name', 'cid', 'smiles' and 'conformers'. 'name' will
            search common names, 'cid' will search the Pubchem Chemical
            Identification Numbers which can be found on their website,
            'smiles' searches for compounds with the entered smiles
            string and 'conformers' fetches a record by its conformer ID.

        silent (bool):
            if True, the available conformers are not looked up and no
            warning is issued when the compound has more than one.

    returns:
        data (str):
            a string containing the raw response from pubchem.

    raises:
        ValueError:
            if pubchem answers with an HTTP error for this search, or
            if the pubchem servers cannot be reached.
    """
    suffix = 'sdf?record_type=3d'

    # conformer records are addressed directly, everything else lives
    # below the 'compound' namespace
    if field == 'conformers':
        url = f'{base_url}/{field}/{search!s}/{suffix}'
    else:
        url = f'{base_url}/compound/{field}/{search!s}/{suffix}'

    try:
        # read the body right away and let the context manager close the
        # connection, so it is not left open during the conformer lookup
        with urllib.request.urlopen(url) as response:
            raw = response.read()
    except HTTPError as e:
        raise ValueError(
            f'the search term {search} could not be found for the field {field}'
        ) from e
    except URLError as e:
        raise ValueError(
            "Couldn't reach the pubchem servers, check your internet connection"
        ) from e

    # check if there are conformers and warn the user if there are
    if field != 'conformers' and not silent:
        conformer_ids = available_conformer_search(search, field)
        if len(conformer_ids) > 1:
            warnings.warn(
                f'The structure "{search}" has more than one conformer in '
                'PubChem. By default, the first conformer is returned, please '
                'ensure you are using the structure you intend to or use the '
                '`ase.data.pubchem.pubchem_conformer_search` function'
            )

    return raw.decode('utf-8')
def _parse_sdf_data_fields(data):
    """Collect the `> <FIELD>` data items of the first SDF record."""
    # the data items start right after the first 'END' line
    _, end_marker, tail = data.partition('END\n')
    if not end_marker:
        raise ValueError(
            'There was a problem with the data returned by PubChem'
        )
    # drop the '$$$$' record delimiter and anything after it; matching the
    # whole delimiter keeps values that contain a single '$' intact
    tail = tail.partition('$$$$')[0]

    fields = {}
    # the structure of this string is > <field>\nentry_info\n
    for chunk in tail.split('> <'):
        if chunk == '':
            continue
        # split once so a value containing '>\n' is not cut apart
        field_name, entry_value = chunk.split('>\n', 1)
        lines = [line for line in entry_value.splitlines() if line != '']
        # single-line values are stored as a plain string
        fields[field_name] = lines[0] if len(lines) == 1 else lines
    return fields


def parse_pubchem_raw(data):
    """
    a helper function for parsing the returned pubchem entries

    Parameters:
        data (str):
            the raw output from pubchem in string form

    returns:
        atoms (ASE Atoms Object):
            An ASE atoms object containing the information from
            pubchem
        pubchem_data (dict):
            a dictionary containing the non-structural information
            from pubchem. Values are a string for single-line entries
            and a list of strings otherwise.

    raises:
        ValueError:
            if `data` does not contain a 'PUBCHEM_COMPOUND_CID' entry or
            has no 'END' line in front of the data items.
    """
    if 'PUBCHEM_COMPOUND_CID' not in data:
        raise ValueError(
            'There was a problem with the data returned by PubChem'
        )
    atoms = read(StringIO(data), format='sdf')

    # further analyze the text returned from pubchem
    pubchem_data = _parse_sdf_data_fields(data)

    # recover partial charges
    charge_entries = pubchem_data.get('PUBCHEM_MMFF94_PARTIAL_CHARGES')
    if charge_entries is not None:
        # a single-line entry was stored as a string; wrap it so that the
        # slice below skips a line and not a character
        if isinstance(charge_entries, str):
            charge_entries = [charge_entries]
        atom_charges = [0.0] * len(atoms)
        # the first entry just contains the number of atoms with charges,
        # each subsequent entry contains the index and charge of an atom
        for entry in charge_entries[1:]:
            index, value = entry.split()
            # indices start at 1
            atom_charges[int(index) - 1] = float(value)
        atoms.set_initial_charges(atom_charges)
    return atoms, pubchem_data
def analyze_input(
    name=None, cid=None, smiles=None, conformer=None, silent=False
):
    """
    helper function to translate keyword arguments from initialization
    and searching into the search and field that is being asked for

    Parameters:
        see `ase.data.pubchem.pubchem_search`. `silent` is accepted so
        that it can be passed alongside the search term; it is not used
        by this function.

    returns:
        search:
            the search term the user has entered. In strings every '#'
            is replaced by '%23'.
        field:
            the name of the field being asked for, one of 'name',
            'cid', 'smiles' or 'conformers'

    raises:
        ValueError:
            if no search term or more than one search term is passed in.
    """
    candidates = {
        'name': name,
        'cid': cid,
        'smiles': smiles,
        'conformers': conformer,
    }
    provided = [
        (field, value)
        for field, value in candidates.items()
        if value is not None
    ]

    if len(provided) > 1:
        raise ValueError(
            'Only one search term may be entered at a time.'
            ' Please pass in only one of the following: '
            'name, cid, smiles, conformer'
        )
    if not provided:
        raise ValueError(
            'No search was entered.'
            ' Please pass in only one of the following: '
            'name, cid, smiles, conformer'
        )

    # exactly one input has been passed in
    field, search = provided[0]

    # convert hash (triple bond) to hex for URL
    if isinstance(search, str):
        search = search.replace('#', '%23')

    return PubchemSearch(search, field)
def available_conformer_search(search, field) -> list:
    """
    Helper function to get the conformer IDs. This searches pubchem for
    the conformers of a given structure and returns all the conformer ids
    of a structure.

    Parameters:
        search (str or int):
            the compound you are searching for. This can be either
            a common name, CID, or smiles string depending on the
            `field` you are searching

        field (str):
            the particular field you are searching with. Possible values
            are 'name', 'cid', and 'smiles'. 'name' will search common
            names, 'cid' will search the Pubchem Chemical Identification
            Numbers which can be found on their website and 'smiles'
            searches for compounds with the entered smiles string.

    returns:
        conformers_ids (list):
            a list of the conformer IDs from PubChem, this is different
            than the CID numbers

    raises:
        ValueError:
            if pubchem answers with an HTTP error for this search, or
            if the pubchem servers cannot be reached.
    """
    suffix = 'conformers/JSON'
    url = f'{base_url}/compound/{field}/{search!s}/{suffix}'
    try:
        # the context manager closes the connection once the body is read
        with urllib.request.urlopen(url) as response:
            payload = response.read()
    except HTTPError as e:
        raise ValueError(
            f'the search term {search} could not be found for the field {field}'
        ) from e
    except URLError as e:
        raise ValueError(
            "Couldn't reach the pubchem servers, check your internet connection"
        ) from e
    record = json.loads(payload.decode('utf-8'))
    # the IDs are taken from the first entry of the information list
    return record['InformationList']['Information'][0]['ConformerID']
def pubchem_search(*args, **kwargs) -> PubchemData:
    """
    Search PubChem for the field and search input on the argument passed in
    returning a PubchemData object. Note that only one search argument may
    be passed in at a time.

    Parameters:
        name (str):
            the common name of the compound you're searching for
        cid (str or int):
            the cid of the compound you're searching for
        smiles (str):
            the smiles string of the compound you're searching for
        conformer (str or int):
            the conformer id of the compound you're searching for
        silent (bool):
            if True, the check for multiple conformers and the resulting
            warning are skipped. Defaults to False.

    returns:
        result (PubchemData):
            a pubchem data object containing the information on the
            requested entry
    """
    search, field = analyze_input(*args, **kwargs)
    # `silent` is the fifth parameter of `analyze_input`, so it may arrive
    # either by keyword or as the fifth positional argument; forward it so
    # that the request for a quiet search is actually honoured
    silent = kwargs.get('silent', args[4] if len(args) > 4 else False)
    raw_pubchem = search_pubchem_raw(search, field, silent=silent)
    atoms, data = parse_pubchem_raw(raw_pubchem)
    return PubchemData(atoms, data)
def pubchem_conformer_search(*args, **kwargs) -> list:
    """
    Look up every conformer PubChem lists for a compound and fetch each
    of them. Only one search argument may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        conformers (list):
            one PubchemData object per conformer ID found for the
            search, in the order the IDs were returned
    """
    term, field_name = analyze_input(*args, **kwargs)
    results = []
    # every conformer ID is fetched with its own search
    for conformer_id in available_conformer_search(term, field_name):
        results.append(pubchem_search(conformer=conformer_id))
    return results
def pubchem_atoms_search(*args, **kwargs):
    """
    Run `pubchem_search` with the given arguments and hand back only the
    atoms object of the result. Note that only one search argument may
    be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        atoms (ASE Atoms Object):
            an ASE Atoms object containing the information on the
            requested entry
    """
    result = pubchem_search(*args, **kwargs)
    return result.get_atoms()
def pubchem_atoms_conformer_search(*args, **kwargs):
    """
    Run `pubchem_conformer_search` with the given arguments and hand back
    only the atoms objects of the results. Note that only one search
    argument may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        conformers (list):
            a list containing the atoms objects of all the conformers
            for your search
    """
    return [
        entry.get_atoms()
        for entry in pubchem_conformer_search(*args, **kwargs)
    ]