Coverage for /builds/ase/ase/ase/io/cif_unicode.py: 88.89%
36 statements
« prev ^ index » next coverage.py v7.5.3, created at 2025-08-02 00:12 +0000
« prev ^ index » next coverage.py v7.5.3, created at 2025-08-02 00:12 +0000
1# fmt: off
3'''
4Conversion of text from a Crystallographic Information File (CIF) format to
5unicode. CIF text is neither unicode nor bibtex/latex code.
7Rules for character formatting in CIF files are specified at:
8https://www.iucr.org/resources/cif/spec/version1.1/semantics
9'''
11import html
12import re
# Mapping from CIF special-character codes to their unicode replacement text.
#
# Where one code is a prefix of another ('--' / '---', r'\\sim' / r'\\simeq')
# the longer code is listed first, so that a consumer which tries the keys in
# iteration order (e.g. a regex alternation) sees the longest match first.
subs_dict = {
    '\r': '',  # Windows line ending
    '\t': ' ',  # tabs

    r'\a': '\u03b1',  # alpha
    r'\b': '\u03b2',  # beta
    r'\g': '\u03b3',  # gamma
    r'\d': '\u03b4',  # delta
    r'\e': '\u03b5',  # epsilon
    r'\z': '\u03b6',  # zeta
    r'\h': '\u03b7',  # eta
    r'\q': '\u03b8',  # theta
    r'\i': '\u03b9',  # iota
    r'\k': '\u03ba',  # kappa
    r'\l': '\u03bb',  # lambda
    r'\m': '\u03bc',  # mu
    r'\n': '\u03bd',  # nu
    r'\x': '\u03be',  # xi
    r'\o': '\u03bf',  # omicron
    r'\p': '\u03c0',  # pi
    r'\r': '\u03c1',  # rho
    r'\s': '\u03c3',  # sigma
    r'\t': '\u03c4',  # tau
    r'\u': '\u03c5',  # upsilon
    r'\f': '\u03c6',  # phi
    r'\c': '\u03c7',  # chi
    r'\y': '\u03c8',  # psi
    r'\w': '\u03c9',  # omega
    r'\A': '\u0391',  # Alpha
    r'\B': '\u0392',  # Beta
    r'\G': '\u0393',  # Gamma
    r'\D': '\u0394',  # Delta
    r'\E': '\u0395',  # Epsilon
    r'\Z': '\u0396',  # Zeta
    r'\H': '\u0397',  # Eta
    r'\Q': '\u0398',  # Theta
    r'\I': '\u0399',  # Iota
    r'\K': '\u039a',  # Kappa
    r'\L': '\u039b',  # Lambda
    r'\M': '\u039c',  # Mu
    r'\N': '\u039d',  # Nu
    r'\X': '\u039e',  # Xi
    r'\O': '\u039f',  # Omicron
    r'\P': '\u03a0',  # Pi
    r'\R': '\u03a1',  # Rho
    r'\S': '\u03a3',  # Sigma
    r'\T': '\u03a4',  # Tau
    r'\U': '\u03a5',  # Upsilon
    r'\F': '\u03a6',  # Phi
    r'\C': '\u03a7',  # Chi
    r'\Y': '\u03a8',  # Psi
    r'\W': '\u03a9',  # Omega

    r'\%a': '\u00e5',  # a-ring
    r'\/o': '\u00f8',  # o-slash
    r'\?i': '\u0131',  # dotless i
    r'\/l': '\u0142',  # Polish l
    r'\&s': '\u00df',  # German eszett
    r'\/d': '\u0111',  # barred d

    r'\%A': '\u00c5',  # A-ring
    r'\/O': '\u00d8',  # O-slash
    r'\?I': 'I',  # dotless I
    r'\/L': '\u0141',  # Polish L
    r'\&S': '\u1e9e',  # German Eszett
    r'\/D': '\u0110',  # barred D

    r'\%': '\u00b0',  # degree
    r'---': '\u2014',  # single bond (must precede its prefix '--')
    r'--': '\u2013',  # dash
    r'\\db': '\u003d',  # double bond
    r'\\tb': '\u2261',  # triple bond
    r'\\ddb': '\u2248',  # delocalized double bond
    r'\\simeq': '\u2243',  # must precede its prefix r'\\sim'
    r'\\sim': '~',
    r'\\infty': '\u221e',  # infinity

    r'\\times': '\u00d7',
    r'+-': '\u00b1',  # plusminus
    r'-+': '\u2213',  # minusplus
    r'\\square': '\u25a0',
    r'\\neq': '\u2260',  # not equal to
    r'\\rangle': '\u3009',
    r'\\langle': '\u3008',
    r'\\rightarrow': '\u2192',
    r'\\leftarrow': '\u2190',

    r"\'A": '\u00c1',  # A acute
    r"\'E": '\u00c9',  # E acute
    r"\'I": '\u00cd',  # I acute
    r"\'O": '\u00d3',  # O acute
    r"\'U": '\u00da',  # U acute
    r"\'Y": '\u00dd',  # Y acute
    r"\'a": '\u00e1',  # a acute
    r"\'e": '\u00e9',  # e acute
    r"\'i": '\u00ed',  # i acute
    r"\'o": '\u00f3',  # o acute
    r"\'u": '\u00fa',  # u acute
    r"\'y": '\u00fd',  # y acute
    r"\'C": '\u0106',  # C acute
    r"\'c": '\u0107',  # c acute
    r"\'L": '\u0139',  # L acute
    r"\'l": '\u013a',  # l acute
    r"\'N": '\u0143',  # N acute
    r"\'n": '\u0144',  # n acute
    r"\'R": '\u0154',  # R acute
    r"\'r": '\u0155',  # r acute
    r"\'S": '\u015a',  # S acute
    r"\'s": '\u015b',  # s acute
    r"\'Z": '\u0179',  # Z acute
    r"\'z": '\u017a',  # z acute
    r"\'G": '\u01f4',  # G acute
    r"\'g": '\u01f5',  # g acute
    r"\'K": '\u1e30',  # K acute
    r"\'k": '\u1e31',  # k acute
    r"\'M": '\u1e3e',  # M acute
    r"\'m": '\u1e3f',  # m acute
    r"\'P": '\u1e54',  # P acute
    r"\'p": '\u1e55',  # p acute
    r"\'W": '\u1e82',  # W acute
    r"\'w": '\u1e83',  # w acute
    r'\;A': '\u0104',  # A ogonek
    r'\;a': '\u0105',  # a ogonek
    r'\;E': '\u0118',  # E ogonek
    r'\;e': '\u0119',  # e ogonek
    r'\;I': '\u012e',  # I ogonek
    r'\;i': '\u012f',  # i ogonek
    r'\;U': '\u0172',  # U ogonek
    r'\;u': '\u0173',  # u ogonek
    r'\;O': '\u01ea',  # O ogonek
    r'\;o': '\u01eb',  # o ogonek
    r'\.C': '\u010a',  # C dot above
    r'\.c': '\u010b',  # c dot above
    r'\.E': '\u0116',  # E dot above
    r'\.e': '\u0117',  # e dot above
    r'\.G': '\u0120',  # G dot above
    r'\.g': '\u0121',  # g dot above
    r'\.I': '\u0130',  # I dot above
    r'\.Z': '\u017b',  # Z dot above
    r'\.z': '\u017c',  # z dot above
    r'\.A': '\u0226',  # A dot above
    r'\.a': '\u0227',  # a dot above
    r'\.O': '\u022e',  # O dot above
    r'\.o': '\u022f',  # o dot above
    r'\.B': '\u1e02',  # B dot above
    r'\.b': '\u1e03',  # b dot above
    r'\.D': '\u1e0a',  # D dot above
    r'\.d': '\u1e0b',  # d dot above
    r'\.F': '\u1e1e',  # F dot above
    r'\.f': '\u1e1f',  # f dot above
    r'\.H': '\u1e22',  # H dot above
    r'\.h': '\u1e23',  # h dot above
    r'\.M': '\u1e40',  # M dot above
    r'\.m': '\u1e41',  # m dot above
    r'\.N': '\u1e44',  # N dot above
    r'\.n': '\u1e45',  # n dot above
    r'\.P': '\u1e56',  # P dot above
    r'\.p': '\u1e57',  # p dot above
    r'\.R': '\u1e58',  # R dot above
    r'\.r': '\u1e59',  # r dot above
    r'\.S': '\u1e60',  # S dot above
    r'\.s': '\u1e61',  # s dot above
    r'\.T': '\u1e6a',  # T dot above
    r'\.t': '\u1e6b',  # t dot above
    r'\.W': '\u1e86',  # W dot above
    r'\.w': '\u1e87',  # w dot above
    r'\.X': '\u1e8a',  # X dot above
    r'\.x': '\u1e8b',  # x dot above
    r'\.Y': '\u1e8e',  # Y dot above
    r'\.y': '\u1e8f',  # y dot above
    r'\(A': '\u0102',  # A breve
    r'\(a': '\u0103',  # a breve
    r'\(E': '\u0114',  # E breve
    r'\(e': '\u0115',  # e breve
    r'\(G': '\u011e',  # G breve
    r'\(g': '\u011f',  # g breve
    r'\(I': '\u012c',  # I breve
    r'\(i': '\u012d',  # i breve
    r'\(O': '\u014e',  # O breve
    r'\(o': '\u014f',  # o breve
    r'\(U': '\u016c',  # U breve
    r'\(u': '\u016d',  # u breve
    r'\=A': '\u0100',  # A macron
    r'\=a': '\u0101',  # a macron
    r'\=E': '\u0112',  # E macron
    r'\=e': '\u0113',  # e macron
    r'\=I': '\u012a',  # I macron
    r'\=i': '\u012b',  # i macron
    r'\=O': '\u014c',  # O macron
    r'\=o': '\u014d',  # o macron
    r'\=U': '\u016a',  # U macron
    r'\=u': '\u016b',  # u macron
    r'\=Y': '\u0232',  # Y macron
    r'\=y': '\u0233',  # y macron
    r'\=G': '\u1e20',  # G macron
    r'\=g': '\u1e21',  # g macron
    r'\^A': '\u00c2',  # A circumflex
    r'\^E': '\u00ca',  # E circumflex
    r'\^I': '\u00ce',  # I circumflex
    r'\^O': '\u00d4',  # O circumflex
    r'\^U': '\u00db',  # U circumflex
    r'\^a': '\u00e2',  # a circumflex
    r'\^e': '\u00ea',  # e circumflex
    r'\^i': '\u00ee',  # i circumflex
    r'\^o': '\u00f4',  # o circumflex
    r'\^u': '\u00fb',  # u circumflex
    r'\^C': '\u0108',  # C circumflex
    r'\^c': '\u0109',  # c circumflex
    r'\^G': '\u011c',  # G circumflex
    r'\^g': '\u011d',  # g circumflex
    r'\^H': '\u0124',  # H circumflex
    r'\^h': '\u0125',  # h circumflex
    r'\^J': '\u0134',  # J circumflex
    r'\^j': '\u0135',  # j circumflex
    r'\^S': '\u015c',  # S circumflex
    r'\^s': '\u015d',  # s circumflex
    r'\^W': '\u0174',  # W circumflex
    r'\^w': '\u0175',  # w circumflex
    r'\^Y': '\u0176',  # Y circumflex
    r'\^y': '\u0177',  # y circumflex
    r'\^Z': '\u1e90',  # Z circumflex
    r'\^z': '\u1e91',  # z circumflex
    r'\"A': '\u00c4',  # A diaeresis
    r'\"E': '\u00cb',  # E diaeresis
    r'\"I': '\u00cf',  # I diaeresis
    r'\"O': '\u00d6',  # O diaeresis
    r'\"U': '\u00dc',  # U diaeresis
    r'\"a': '\u00e4',  # a diaeresis
    r'\"e': '\u00eb',  # e diaeresis
    r'\"i': '\u00ef',  # i diaeresis
    r'\"o': '\u00f6',  # o diaeresis
    r'\"u': '\u00fc',  # u diaeresis
    r'\"y': '\u00ff',  # y diaeresis
    r'\"Y': '\u0178',  # Y diaeresis
    r'\"H': '\u1e26',  # H diaeresis
    r'\"h': '\u1e27',  # h diaeresis
    r'\"W': '\u1e84',  # W diaeresis
    r'\"w': '\u1e85',  # w diaeresis
    r'\"X': '\u1e8c',  # X diaeresis
    r'\"x': '\u1e8d',  # x diaeresis
    r'\"t': '\u1e97',  # t diaeresis
    r'\~A': '\u00c3',  # A tilde
    r'\~N': '\u00d1',  # N tilde
    r'\~O': '\u00d5',  # O tilde
    r'\~a': '\u00e3',  # a tilde
    r'\~n': '\u00f1',  # n tilde
    r'\~o': '\u00f5',  # o tilde
    r'\~I': '\u0128',  # I tilde
    r'\~i': '\u0129',  # i tilde
    r'\~U': '\u0168',  # U tilde
    r'\~u': '\u0169',  # u tilde
    r'\~V': '\u1e7c',  # V tilde
    r'\~v': '\u1e7d',  # v tilde
    r'\~E': '\u1ebc',  # E tilde
    r'\~e': '\u1ebd',  # e tilde
    r'\~Y': '\u1ef8',  # Y tilde
    r'\~y': '\u1ef9',  # y tilde
    r'\<C': '\u010c',  # C caron
    r'\<c': '\u010d',  # c caron
    r'\<D': '\u010e',  # D caron
    r'\<d': '\u010f',  # d caron
    r'\<E': '\u011a',  # E caron
    r'\<e': '\u011b',  # e caron
    r'\<L': '\u013d',  # L caron
    r'\<l': '\u013e',  # l caron
    r'\<N': '\u0147',  # N caron
    r'\<n': '\u0148',  # n caron
    r'\<R': '\u0158',  # R caron
    r'\<r': '\u0159',  # r caron
    r'\<S': '\u0160',  # S caron
    r'\<s': '\u0161',  # s caron
    r'\<T': '\u0164',  # T caron
    r'\<t': '\u0165',  # t caron
    r'\<Z': '\u017d',  # Z caron
    r'\<z': '\u017e',  # z caron
    r'\<A': '\u01cd',  # A caron
    r'\<a': '\u01ce',  # a caron
    r'\<I': '\u01cf',  # I caron
    r'\<i': '\u01d0',  # i caron
    r'\<O': '\u01d1',  # O caron
    r'\<o': '\u01d2',  # o caron
    r'\<U': '\u01d3',  # U caron
    r'\<u': '\u01d4',  # u caron
    r'\<G': '\u01e6',  # G caron
    r'\<g': '\u01e7',  # g caron
    r'\<K': '\u01e8',  # K caron
    r'\<k': '\u01e9',  # k caron
    r'\<j': '\u01f0',  # j caron
    r'\<H': '\u021e',  # H caron
    r'\<h': '\u021f',  # h caron
    r'\>O': '\u0150',  # O double acute
    r'\>o': '\u0151',  # o double acute
    r'\>U': '\u0170',  # U double acute
    r'\>u': '\u0171',  # u double acute
    r'\,C': '\u00c7',  # C cedilla
    r'\,c': '\u00e7',  # c cedilla
    r'\,G': '\u0122',  # G cedilla
    r'\,g': '\u0123',  # g cedilla
    r'\,K': '\u0136',  # K cedilla
    r'\,k': '\u0137',  # k cedilla
    r'\,L': '\u013b',  # L cedilla
    r'\,l': '\u013c',  # l cedilla
    r'\,N': '\u0145',  # N cedilla
    r'\,n': '\u0146',  # n cedilla
    r'\,R': '\u0156',  # R cedilla
    r'\,r': '\u0157',  # r cedilla
    r'\,S': '\u015e',  # S cedilla
    r'\,s': '\u015f',  # s cedilla
    r'\,T': '\u0162',  # T cedilla
    r'\,t': '\u0163',  # t cedilla
    r'\,E': '\u0228',  # E cedilla
    r'\,e': '\u0229',  # e cedilla
    r'\,D': '\u1e10',  # D cedilla
    r'\,d': '\u1e11',  # d cedilla
    r'\,H': '\u1e28',  # H cedilla
    r'\,h': '\u1e29',  # h cedilla
    r'\`A': '\u00c0',  # A grave
    r'\`E': '\u00c8',  # E grave
    r'\`I': '\u00cc',  # I grave
    r'\`O': '\u00d2',  # O grave
    r'\`U': '\u00d9',  # U grave
    r'\`a': '\u00e0',  # a grave
    r'\`e': '\u00e8',  # e grave
    r'\`i': '\u00ec',  # i grave
    r'\`o': '\u00f2',  # o grave
    r'\`u': '\u00f9',  # u grave
    r'\`N': '\u01f8',  # N grave
    r'\`n': '\u01f9',  # n grave
    r'\`W': '\u1e80',  # W grave
    r'\`w': '\u1e81',  # w grave
    r'\`Y': '\u1ef2',  # Y grave
    r'\`y': '\u1ef3',  # y grave
}
# ASCII digit -> unicode superscript digit.  Note that 1, 2 and 3 live in the
# Latin-1 Supplement block, the remaining digits in U+2070..U+2079.
superscript_dict = dict(zip(
    '0123456789',
    '\u2070\u00b9\u00b2\u00b3\u2074\u2075\u2076\u2077\u2078\u2079',
))
# ASCII digit -> unicode subscript digit (contiguous range U+2080..U+2089).
subscript_dict = dict(zip(
    '0123456789',
    '\u2080\u2081\u2082\u2083\u2084\u2085\u2086\u2087\u2088\u2089',
))
def replace_subscript(s: str, subscript=True) -> str:
    """Convert digits inside CIF sub-/superscript markers to unicode.

    Subscripts are delimited by ``~`` and superscripts by ``^``.  Each
    unescaped marker toggles an "inside" state and is dropped from the
    output; while inside, ASCII digits are replaced by the corresponding
    unicode sub-/superscript digit and every other character is kept as is.
    A marker without a closing partner therefore leaves the rest of the
    string "inside".

    A marker directly preceded by a backslash is part of a CIF accent code
    (``\\~n`` is n-tilde, ``\\^a`` is a-circumflex) and not a delimiter; it
    is copied through unchanged and does not toggle the state.

    Parameters:

    s: string
        The CIF text to process.
    subscript: bool
        If true (default) handle ``~`` subscripts, otherwise handle ``^``
        superscripts.

    Returns:

    u: string
        The text with the selected markers removed and the digits they
        enclosed converted.
    """
    if subscript:
        target, rdict = '~', subscript_dict
    else:
        target, rdict = '^', superscript_dict

    replaced = []
    inside = False
    previous = ''
    for char in s:
        if char == target and previous != '\\':
            inside = not inside
        elif inside:
            # note: do not use char.isdigit - this also matches
            # (sub/super)scripts
            replaced.append(rdict.get(char, char))
        else:
            replaced.append(char)
        previous = char

    return ''.join(replaced)
def multiple_replace(text: str, adict) -> str:
    """Replace every occurrence of a key of *adict* in *text* by its value.

    All replacements happen in a single left-to-right pass, so replacement
    text is never itself scanned for further keys.  When several keys match
    at the same position the longest one wins (e.g. ``---`` is preferred
    over ``--``).

    Parameters:

    text: string
        The text to process.
    adict: mapping of string to string
        Literal search strings and their replacements.

    Returns:

    u: string
        The text with all replacements applied.
    """
    if not adict:
        # An empty alternation would match the empty string at every
        # position and then fail the lookup below; nothing to replace.
        return text

    # The regex engine takes the first alternative that matches, not the
    # longest one, so longer keys have to come first.  sorted() is stable,
    # hence keys of equal length keep the mapping's own order.
    keys = sorted(adict, key=len, reverse=True)
    rx = re.compile('|'.join(map(re.escape, keys)))

    def one_xlat(match):
        return adict[match.group(0)]

    return rx.sub(one_xlat, text)
def format_unicode(s: str) -> str:
    """Convert a string in CIF text-format to unicode.

    The conversion runs in three steps, in this order:

    1. HTML character references are unescaped with ``html.unescape``.
    2. CIF special-character codes are replaced as listed in ``subs_dict``.
    3. Every ``<...>`` span (shortest match, not crossing a line break) is
       removed as an HTML tag.

    Since step 1 precedes step 3, angle brackets that were written as
    ``&lt;`` / ``&gt;`` are handled like literal ones during tag removal.

    Parameters:

    s: string
        The CIF text string to convert

    Returns:

    u: string
        A unicode formatted string.
    """
    unescaped = html.unescape(s)
    substituted = multiple_replace(unescaped, subs_dict)
    return re.sub('<.*?>', '', substituted)
def handle_subscripts(s: str) -> str:
    """Convert CIF subscript and superscript markup in *s* to unicode.

    The string is passed through ``replace_subscript`` twice: first in
    subscript mode, then the result again in superscript mode.
    """
    with_subscripts = replace_subscript(s, subscript=True)
    return replace_subscript(with_subscripts, subscript=False)