# Source code for src.graphdb_builder.databases.parsers.foodbParser

import os.path
import tarfile
from collections import defaultdict
from graphdb_builder import mapping as mp, builder_utils
import pandas as pd

###################
#       FooDB     # 
###################


def parser(databases_directory, download=True):
    """Download, extract and parse the FooDB archive.

    Reads the settings from ``foodbConfig.yml``, optionally downloads the
    archive, extracts it, parses every file listed under ``config['files']``
    and writes the food alias mapping to ``mapping.tsv`` in the FooDB
    directory.

    :param str databases_directory: directory under which the ``FooDB``
        working directory is located.
    :param bool download: when True, the archive is downloaded from
        ``config['database_url']`` before parsing.
    :return: tuple ``(food, relationships, entities_header,
        relationships_headers)`` where ``food`` is the set of food entity
        tuples and ``relationships`` maps ``("food", "has_content")`` to a
        set of relationship tuples.
    :raises Exception: if the archive cannot be read as a tar file.
    """
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "FooDB")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="foodbConfig.yml", data_type='databases')

    database_url = config['database_url']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    tar_fileName = os.path.join(directory, database_url.split('/')[-1])
    if download:
        builder_utils.downloadDB(database_url, directory)

    contents = {}
    food = set()
    compounds = {}
    # Initialised up front so a configuration without "Food.csv" does not
    # raise NameError when the mapping file is written below.
    mapping = {}
    try:
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(tar_fileName, 'r') as tf:
            # The second archive member is used as the name of the extracted
            # top-level directory (presumably the first one is "." -- TODO confirm).
            tar_dir = tf.getnames()[1]
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; consider an extraction filter if the source is untrusted.
            tf.extractall(path=directory)

        for file_name in config['files']:
            path = os.path.join(directory, os.path.join(tar_dir, file_name))
            with open(path, 'r', encoding="utf-8", errors='replace') as f:
                if file_name == "Content.csv":
                    contents = parseContents(f)
                elif file_name == "Food.csv":
                    food, mapping = parseFood(f)
                elif file_name == "Compound.csv":
                    compounds = parseCompounds(f)

        # Keep only the contents whose compound has a mapped code.
        for (food_id, compound_id), attributes in contents.items():
            if compound_id in compounds:
                compound_code = compounds[compound_id].replace("HMDB", "HMDB00")
                relationships[("food", "has_content")].add(
                    (food_id, compound_code, "HAS_CONTENT") + attributes)

        mp.reset_mapping(entity="Food")
        with open(os.path.join(directory, "mapping.tsv"), 'w', encoding='utf-8') as out:
            for food_id in mapping:
                for alias in mapping[food_id]:
                    out.write(str(food_id)+"\t"+str(alias)+"\n")

        mp.mark_complete_mapping(entity="Food")
    except tarfile.ReadError as err:
        raise Exception("Error importing database FooDB.\n {}".format(err)) from err

    # NOTE(review): this is the same directory mapping.tsv was just written
    # to -- confirm what remove_directory() actually deletes.
    builder_utils.remove_directory(directory)

    return food, relationships, entities_header, relationships_headers


def parseContents(fhandler):
    """Parse the food/compound content table.

    The first line is treated as a header and skipped. Every other line is
    split on commas; only lines that yield exactly 24 fields are used, all
    others are silently ignored.

    :param fhandler: iterable of text lines (e.g. an open file handle).
    :return: dictionary keyed by ``(food_id, compound_id)`` whose values are
        ``(min_cont, max_cont, average, units, "FooDB")`` tuples. ``food_id``
        is an int, ``compound_id`` is kept as a string.
    """
    def to_number(value):
        # 'NULL' marks a missing amount and is stored as 0.
        return float(value) if value != 'NULL' else 0

    contents = {}
    lines = iter(fhandler)
    next(lines, None)  # skip the header line (no-op on empty input)
    for line in lines:
        # NOTE(review): a plain comma split does not understand quoted fields;
        # rows with embedded commas do not have 24 fields and are dropped.
        data = line.rstrip("\r\n").split(",")
        if len(data) != 24:
            continue
        compound_id = data[0]
        food_id = int(data[3])
        min_cont = to_number(data[11])
        max_cont = to_number(data[12])
        units = data[13].replace('"', '')
        average = to_number(data[23])
        contents[(food_id, compound_id)] = (min_cont, max_cont, average, units, "FooDB")
    return contents


def parseFood(fhandler):
    """Parse the food table into entity tuples and an alias mapping.

    The file is read without a header, so the first row (the column names)
    is skipped explicitly. Malformed lines are skipped with a warning.

    :param fhandler: open text file handle (or buffer) of the food CSV.
    :return: tuple ``(food, mapping)``; ``food`` is a set of
        ``(food_id, name, sci_name, description, group, subgroup, "FooDB")``
        tuples and ``mapping`` maps each food_id to the set of its names.
    """
    read_options = {"sep": ',', "header": None, "low_memory": False, "encoding": "utf-8"}
    try:
        # on_bad_lines='warn' is the replacement for error_bad_lines=False
        # (deprecated in pandas 1.3, removed in 2.0).
        df = pd.read_csv(fhandler, on_bad_lines='warn', **read_options)
    except TypeError:
        # pandas < 1.3 rejects the unknown keyword before reading anything.
        df = pd.read_csv(fhandler, error_bad_lines=False, **read_options)

    food = set()
    mapping = defaultdict(set)
    # Row 0 holds the column names because the file was read with header=None.
    for _, row in df.iloc[1:].iterrows():
        food_id = row[22]
        name = row[1]
        sci_name = row[2]
        description = str(row[3]).replace('"', '')
        group = row[11]
        subgroup = row[12]
        food.add((food_id, name, sci_name, description, group, subgroup, "FooDB"))
        aliases = mapping[food_id]
        for alias in (name, sci_name):
            # Missing names are read as NaN; do not register "nan" as an alias.
            if pd.notna(alias):
                aliases.add(alias)

    return food, mapping


def parseCompounds(fhandler):
    """Parse the compound table into a compound_id -> mapped code dictionary.

    The file is read without a header, so the first row (the column names)
    is skipped explicitly. Malformed lines are skipped with a warning.
    Compounds whose mapped code (column 44) is missing are left out.

    :param fhandler: open text file handle (or buffer) of the compound CSV.
    :return: dictionary mapping the value of column 0 to the value of
        column 44 for every row where column 44 is not missing.
    """
    read_options = {"sep": ',', "header": None, "low_memory": False, "encoding": "utf-8"}
    try:
        # on_bad_lines='warn' is the replacement for error_bad_lines=False
        # (deprecated in pandas 1.3, removed in 2.0).
        df = pd.read_csv(fhandler, on_bad_lines='warn', **read_options)
    except TypeError:
        # pandas < 1.3 rejects the unknown keyword before reading anything.
        df = pd.read_csv(fhandler, error_bad_lines=False, **read_options)

    compounds = {}
    # Row 0 holds the column names because the file was read with header=None.
    for _, row in df.iloc[1:].iterrows():
        compound_id = row[0]
        mapped_code = row[44]
        # Empty cells are read as NaN, which stringifies to 'nan'.
        if str(mapped_code) != 'nan':
            compounds[compound_id] = mapped_code
    return compounds


# Script entry point: running this module directly does nothing.
if __name__ == "__main__":
    pass