Source code for msp2db.re
#!/usr/bin/env python
from __future__ import absolute_import, unicode_literals, print_function
import collections
def get_meta_regex(schema='mona'):
    """Create a dictionary of regex for extracting the meta data for the spectra.

    Args:
        schema (str): Which set of patterns to build. ``'mona'`` and
            ``'massbank'`` are recognised; any other value leaves the
            result empty.

    Returns:
        collections.OrderedDict: Maps a meta data field name (e.g.
        ``'precursor_mz'``) to a list of regular expression strings. Every
        pattern has exactly one capturing group, which holds the value of
        the field. Keys are inserted in a fixed order (for example
        ``'instrument_type'`` comes before the more general
        ``'instrument'``).
    """
    # NOTE: the patterns mix upper and lower case; they are written on the
    # assumption that the caller matches case-insensitively (to avoid
    # repetition here) -- confirm against the caller.
    #
    # All patterns are raw strings so that regex escapes such as \d, \s and
    # \$ are not treated as (invalid) Python string escape sequences.
    meta_parse = collections.OrderedDict()

    if schema == 'mona':
        meta_parse['collision_energy'] = [r'^collision energy(?:=|:)(.*)$']
        meta_parse['ms_level'] = [
            r'^ms.*level(?:=|:)\D*(\d*)$',
            r'^ms type(?:=|:)\D*(\d*)$',
            r'^Spectrum_type(?:=|:)\D*(\d*)$',
        ]
        meta_parse['accession'] = [
            r'^accession(?:=|:)(.*)$',
            r'^DB#(?:=|:)(.*)$',
        ]
        meta_parse['resolution'] = [r'^resolution(?:=|:)(.*)$']
        meta_parse['polarity'] = [
            r'^ion.*mode(?:=|:)(.*)$',
            r'^ionization.*mode(?:=|:)(.*)$',
            r'^polarity(?:=|:)(.*)$',
        ]
        meta_parse['fragmentation_type'] = [
            r'^fragmentation.*mode(?:=|:)(.*)$',
            r'^fragmentation.*type(?:=|:)(.*)$',
        ]
        meta_parse['precursor_mz'] = [
            r'^precursor m/z(?:=|:)\s*(\d*[.,]?\d*)$',
            r'^precursor.*mz(?:=|:)\s*(\d*[.,]?\d*)$',
        ]
        meta_parse['precursor_type'] = [
            r'^precursor.*type(?:=|:)(.*)$',
            r'^adduct(?:=|:)(.*)$',
        ]
        meta_parse['instrument_type'] = [r'^instrument.*type(?:=|:)(.*)$']
        meta_parse['instrument'] = [r'^instrument(?:=|:)(.*)$']
        meta_parse['copyright'] = [r'^copyright(?:=|:)(.*)$']
        # meta_parse['column'] = ['^column(?:=|:)(.*)$']
        meta_parse['mass_accuracy'] = [r'^mass.*accuracy(?:=|:)\s*(\d*[.,]?\d*)$']
        meta_parse['mass_error'] = [r'^mass.*error(?:=|:)\s*(\d*[.,]?\d*)$']
        meta_parse['origin'] = [r'^origin(?:=|:)(.*)$']
        meta_parse['name'] = [r'^Name(?:=|:)(.*)$']
        # Only the ':' separator is accepted for splash (no '=' alternative).
        meta_parse['splash'] = [r'^splash:(.*)$']
        meta_parse['retention_time'] = [r'^retention.*time(?:=|:)\s*(\d*[.,]?\d*)$']
        meta_parse['retention_index'] = [r'^retention.*index(?:=|:)\s*(\d*[.,]?\d*)$']

    elif schema == 'massbank':
        meta_parse['collision_energy'] = [r'^AC\$MASS_SPECTROMETRY:\s+COLLISION_ENERGY\s+(.*)$']
        meta_parse['ms_level'] = [r'^AC\$MASS_SPECTROMETRY:\s+MS_TYPE\s+\D*(\d*)$']
        meta_parse['accession'] = [r'^ACCESSION:(.*)$']
        meta_parse['resolution'] = [r'^AC\$MASS_SPECTROMETRY:\s+RESOLUTION\s+(.*)$']
        meta_parse['polarity'] = [r'^AC\$MASS_SPECTROMETRY:\s+ION_MODE\s+(.*)$']
        meta_parse['fragmentation_type'] = [r'^AC\$MASS_SPECTROMETRY:\s+FRAGMENTATION_MODE\s+(.*)$']
        meta_parse['precursor_mz'] = [r'^MS\$FOCUSED_ION:\s+PRECURSOR_M/Z\s+(\d*[.,]?\d*)$']
        meta_parse['precursor_type'] = [r'^MS\$FOCUSED_ION:\s+PRECURSOR_TYPE\s+(.*)$']
        meta_parse['instrument_type'] = [r'^AC\$INSTRUMENT_TYPE:\s+(.*)$']
        meta_parse['instrument'] = [r'^AC\$INSTRUMENT:\s+(.*)$']
        meta_parse['copyright'] = [r'^COPYRIGHT:\s+(.*)']
        # meta_parse['column'] = ['^column(?:=|:)(.*)$']
        meta_parse['mass_accuracy'] = [r'^AC\$MASS_SPECTROMETRY:\s+ACCURACY\s+(.*)$']  # need to check
        meta_parse['mass_error'] = [r'^AC\$MASS_SPECTROMETRY:\s+ERROR\s+(.*)$']  # need to check
        meta_parse['splash'] = [r'^PK\$SPLASH:\s+(.*)$']
        meta_parse['origin'] = [r'^origin(?:=|:)(.*)$']
        meta_parse['name'] = [r'^RECORD_TITLE:\s+(.*)$']
        meta_parse['retention_time'] = [r'^AC\$CHROMATOGRAPHY:\s+RETENTION.*TIME\s+(\d*[.,]?\d*)$']
        meta_parse['retention_index'] = [r'^AC\$CHROMATOGRAPHY:\s+RETENTION.*INDEX\s+(\d*[.,]?\d*)$']

    # An unrecognised schema falls through and returns the empty mapping.
    return meta_parse
def get_compound_regex(schema='mona'):
    """Create a dictionary of regex for extracting the compound information for the spectra.

    Args:
        schema (str): Which set of patterns to build. ``'mona'`` and
            ``'massbank'`` are recognised; any other value leaves the
            result empty.

    Returns:
        collections.OrderedDict: Maps a compound field name (e.g.
        ``'inchikey_id'``) to a list of regular expression strings. Every
        pattern has exactly one capturing group, which holds the value of
        the field.
    """
    # NOTE: the patterns mix upper and lower case; they are written on the
    # assumption that the caller matches case-insensitively (to avoid
    # repetition here) -- confirm against the caller.
    #
    # All patterns are raw strings so that regex escapes such as \d, \s and
    # \$ are not treated as (invalid) Python string escape sequences.
    meta_parse = collections.OrderedDict()

    if schema == 'mona':
        meta_parse['name'] = [r'^Name(?:=|:)(.*)$']
        meta_parse['inchikey_id'] = [r'^inchikey(?:=|:)(.*)$']
        meta_parse['molecular_formula'] = [
            r'^molecular formula(?:=|:)(.*)$',
            r'^formula:(.*)$',
        ]
        meta_parse['molecular_weight'] = [r'^MW(?:=|:)(\d*[.,]?\d*)$']
        # These two patterns require a double quote right after the digits.
        meta_parse['pubchem_id'] = [r'^pubchem.*cid(?:=|:)(\d*)".*$']
        meta_parse['chemspider_id'] = [r'^chemspider(?:=|:)(\d*)".*$']
        meta_parse['compound_class'] = [r'^compound.*class(?:=|:)(.*)$']
        meta_parse['exact_mass'] = [r'^exact.*mass(?:=|:)(\d*[.,]?\d*)$']
        meta_parse['smiles'] = [r'^SMILES(?:=|:)(.*)$']
        meta_parse['other_names'] = [r'^Synonym(?:=|:)(.*)$']

    elif schema == 'massbank':
        # 'name' and 'other_names' deliberately share the same CH$NAME pattern.
        meta_parse['name'] = [r'^CH\$NAME:\s+(.*)$']
        meta_parse['other_names'] = [r'^CH\$NAME:\s+(.*)$']
        meta_parse['inchikey_id'] = [r'^CH\$LINK:\s+INCHIKEY\s+(.*)$']
        meta_parse['molecular_formula'] = [r'^CH\$FORMULA:\s+(.*)$']
        meta_parse['molecular_weight'] = [r'^CH\$MOLECULAR_WEIGHT:\s+(.*)$']
        meta_parse['pubchem_id'] = [r'^CH\$LINK:\s+PUBCHEM\s+CID:(.*)$']
        meta_parse['chemspider_id'] = [r'^CH\$LINK:\s+CHEMSPIDER\s+(.*)$']
        meta_parse['compound_class'] = [r'^CH\$COMPOUND_CLASS:\s+(.*)$']
        meta_parse['exact_mass'] = [r'^CH\$EXACT_MASS:\s+(.*)$']
        meta_parse['smiles'] = [r'^CH\$SMILES:\s+(.*)$']

    # An unrecognised schema falls through and returns the empty mapping.
    return meta_parse