Source code for src.graphdb_builder.databases.parsers.exposomeParser

import os.path
import zipfile
from graphdb_builder import mapping as mp, builder_utils
from collections import defaultdict
import pandas as pd

###############################
#       Exposome Explorer     # 
###############################
def parser(databases_directory, download=True):
    """
    Build food-metabolite correlation relationships from the Exposome Explorer archives.

    The archive URLs and the relationships header are read from
    ``exposomeConfig.yml``. Each URL is expected to point at a zip archive
    whose name starts with the CSV it contains (e.g. ``biomarkers.csv.zip``).

    :param str databases_directory: directory under which the \
        ``ExposomeExplorer`` working directory is created.
    :param bool download: when True, every configured archive is downloaded \
        into the working directory before parsing.
    :return: tuple ``(correlations, relationships_header)``; ``correlations`` \
        is whatever ``parseCorrelationsFile`` returns, or an empty dict when \
        no ``correlations.csv`` archive is configured.
    """
    directory = os.path.join(databases_directory, "ExposomeExplorer")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="exposomeConfig.yml", data_type='databases')
    database_urls = config['database_urls']
    relationships_header = config['relationships_header']
    mapping = mp.getMappingForEntity("Food")

    # First pass: fetch everything and remember where each CSV's archive lives.
    # Parsing is deferred so that it no longer depends on the order of the
    # URLs in the configuration file.
    archives = {}
    for url in database_urls:
        archive_name = url.split('/')[-1]
        # "biomarkers.csv.zip" -> "biomarkers.csv"
        file_name = '.'.join(archive_name.split('.')[0:2])
        if download:
            builder_utils.downloadDB(url, directory)
        archives[file_name] = os.path.join(directory, archive_name)

    # Second pass: biomarkers must be parsed before correlations, because the
    # correlations parser resolves biomarker identifiers through this lookup.
    # Without a biomarkers archive the lookup stays empty, so no correlation
    # can be resolved.
    biomarkers = {}
    if "biomarkers.csv" in archives:
        with zipfile.ZipFile(archives["biomarkers.csv"]) as z:
            biomarkers = parseBiomarkersFile(z, "biomarkers.csv")

    correlations = {}
    if "correlations.csv" in archives:
        with zipfile.ZipFile(archives["correlations.csv"]) as z:
            correlations = parseCorrelationsFile(z, "correlations.csv", biomarkers, mapping)

    builder_utils.remove_directory(directory)

    return correlations, relationships_header
def parseBiomarkersFile(fhandler, file_name):
    """
    Map biomarker identifiers to metabolite identifiers from the biomarkers CSV.

    The file is read without a header (the first row is skipped afterwards),
    so every value is kept exactly as pandas parsed it from the text.

    :param fhandler: open ``zipfile.ZipFile`` containing ``file_name``.
    :param str file_name: name of the CSV member inside the archive.
    :return: dict mapping column 0 (biomarker identifier) to column 10 \
        (metabolite identifier) for every row whose column 10 is not empty.
    """
    with fhandler.open(file_name) as f:
        try:
            # pandas >= 1.3 spelling of "silently drop malformed lines".
            df = pd.read_csv(f, sep=',', header=None, on_bad_lines='skip', low_memory=False)
        except TypeError:
            # Older pandas rejects the unknown keyword before reading anything,
            # so the handle is still at the start of the member.
            df = pd.read_csv(f, sep=',', header=None, error_bad_lines=False, low_memory=False)

    biomarkers = {}
    data = df.iloc[1:]  # row 0 holds the column titles
    for identifier, metabolite in zip(data[0], data[10]):
        # read_csv turns empty fields into NaN, so comparing against '' alone
        # would let biomarkers without a metabolite slip through.
        if pd.notna(metabolite) and metabolite != '':
            biomarkers[identifier] = metabolite

    return biomarkers
def parseCorrelationsFile(fhandler, file_name, biomarkers, mapping):
    """
    Extract significant food-metabolite correlations from the correlations CSV.

    A row is kept only when its significance flag (column 33) is one of
    ``Yes``/``yes``/``YES``/``Y``, its food name (column 9) is a key of
    ``mapping`` and its biomarker identifier (column 0) is a key of
    ``biomarkers``.

    :param fhandler: open ``zipfile.ZipFile`` containing ``file_name``.
    :param str file_name: name of the CSV member inside the archive.
    :param dict biomarkers: biomarker identifier -> metabolite identifier.
    :param dict mapping: food name -> food identifier.
    :return: ``defaultdict(set)`` with the single key \
        ``("food", "correlated_with_metabolite")`` holding tuples \
        ``(food_id, biomarker_id, "CORRELATED_WITH_METABOLITE", intake_median, \
        intake_units, biosample, method, corr, ci_low, ci_high, pvalue, \
        significant, publication, "Exposome Explorer")``; empty when no row \
        qualifies.
    :raises ValueError: if the correlation coefficient (column 29) of a \
        qualifying row cannot be converted to float.
    """
    correlations = defaultdict(set)
    with fhandler.open(file_name) as f:
        try:
            # pandas >= 1.3 spelling of "silently drop malformed lines".
            df = pd.read_csv(f, sep=',', header=None, on_bad_lines='skip', low_memory=False)
        except TypeError:
            # Older pandas rejects the unknown keyword before reading anything,
            # so the handle is still at the start of the member.
            df = pd.read_csv(f, sep=',', header=None, error_bad_lines=False, low_memory=False)

    # Row 0 holds the column titles.
    for _, row in df.iloc[1:].iterrows():
        significant = row[33]
        if significant not in ["Yes", "yes", "YES", "Y"]:
            continue
        food_name = row[9]
        if food_name not in mapping:
            continue
        biomarker = row[0]
        if biomarker not in biomarkers:
            continue

        # The coefficient is converted only for rows that are actually kept,
        # so an unparsable value on a discarded row cannot abort the parse.
        corr = float(row[29])
        correlations[("food", "correlated_with_metabolite")].add((
            mapping[food_name],
            biomarkers[biomarker],
            "CORRELATED_WITH_METABOLITE",
            row[14],  # intake median
            row[15],  # intake units
            row[18],  # biosample
            row[19],  # measurement method
            corr,
            row[30],  # confidence interval, lower bound
            row[31],  # confidence interval, upper bound
            row[32],  # p-value
            significant,
            row[37],  # publication
            "Exposome Explorer",
        ))

    return correlations
# Running this module as a script does nothing.
if __name__ == "__main__": pass