# Source code for src.graphdb_builder.experiments.parsers.proteomicsParser
import os
import re
import pandas as pd
import numpy as np
from collections import defaultdict
from graphdb_builder import builder_utils, mapping
def parser(projectId, type='proteomics', directory=None):
    """Parse the proteomics results of a project.

    When ``directory`` is None a default location relative to this module is
    used; if the ``proteomics.yml`` configuration has a ``directory`` entry,
    that entry replaces the default. In both cases ``type`` is appended to
    the base path. Any ``PROJECTID`` placeholder in the resulting path is
    replaced by ``projectId``.

    :param str projectId: project identifier substituted into the path.
    :param str type: experiment type appended to the base directory.
    :param directory: directory to parse, or None to use the default.
    :return: the dictionary returned by ``parse_from_directory``.
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    config = builder_utils.get_config(config_name="proteomics.yml", data_type='experiments')
    if directory is None:
        if 'directory' in config:
            base_path = config['directory']
        else:
            base_path = '../../../../data/experiments/PROJECTID/'
        directory = os.path.join(module_dir, base_path + type)
    project_directory = directory.replace('PROJECTID', projectId)

    return parse_from_directory(projectId, project_directory, config)
def _find_results_file(results_path, pattern):
    """Return the path of the first file under ``results_path`` whose name
    matches ``pattern`` (a compiled regex), or '' when nothing matches."""
    for root, _, files in os.walk(results_path):
        for name in files:
            if pattern.match(name):
                # Join with the directory the file actually lives in, so that
                # matches found in sub-directories yield an existing path.
                return os.path.join(root, name)
    return ''


def parse_from_directory(projectId, directory, configuration=None):
    """Parse every configured processing-tool directory found under ``directory``.

    Each directory below ``directory`` (including ``directory`` itself) whose
    base name is a key of ``configuration`` is treated as the output of that
    processing tool. For every data type configured for the tool, the first
    file matching the configured ``file`` regex is parsed with
    ``parser_from_file`` and the result is merged into the returned dictionary.

    :param str projectId: project identifier, passed to the sample mapping helpers.
    :param str directory: root directory to walk.
    :param dict configuration: processing tool -> data type -> dataset configuration.
        Defaults to an empty configuration.
    :return: dictionary with the merged results of ``parser_from_file``.
    :raises Exception: if a dataset configuration lacks a required key.
    """
    if configuration is None:
        configuration = {}
    data = {}
    # Materialise the directory list first: the mapping step below may touch
    # files while we iterate.
    processing_results = [x[0] for x in os.walk(directory)]
    for results_path in processing_results:
        processing_tool = os.path.basename(os.path.normpath(results_path))
        if processing_tool not in configuration:
            continue
        sample_mapping = mapping.get_mapping_analytical_samples(projectId)
        if len(sample_mapping) > 0:
            mapping.map_experiment_files(projectId, os.path.join(directory, processing_tool), sample_mapping)
        tool_configuration = configuration[processing_tool]
        for dtype in tool_configuration:
            dataset_configuration = tool_configuration[dtype]
            missing_conf = check_minimum_configuration(dataset_configuration)
            if len(missing_conf) > 0:
                raise Exception("Error when importing proteomics experiment.\n Missing configuration: {}".format(",".join(missing_conf)))
            dfile_regex = re.compile(dataset_configuration['file'])
            filepath = _find_results_file(results_path, dfile_regex)
            data.update(parser_from_file(file_path=filepath, configuration=dataset_configuration, data_type=dtype))

    return data
def parser_from_file(file_path, configuration, data_type, is_standard=True):
    """Load one dataset file and build the tables for its data type.

    The file is loaded with ``parse_standard_dataset`` when ``is_standard`` is
    true and with ``parse_dataset`` otherwise. The returned dictionary is keyed
    by ``(name, mode)`` tuples, where mode is 'w' or 'a'.

    :param str file_path: path of the dataset file.
    :param dict configuration: dataset configuration.
    :param str data_type: "proteins", "peptides" or anything else (handled as
        protein modifications).
    :param bool is_standard: selects the loading function.
    :return: dict of ``(name, mode)`` -> DataFrame; empty if nothing was loaded.
    """
    loader = parse_standard_dataset if is_standard else parse_dataset
    df = loader(file_path, configuration)
    if df is None or df.empty:
        return {}

    if data_type == "proteins":
        return {(data_type, 'w'): extract_protein_subject_rels(df, configuration)}

    if data_type == "peptides":
        return {
            ('subject_peptide', 'w'): extract_peptide_subject_rels(df, configuration),
            ('peptide_protein', 'w'): extract_peptide_protein_rels(df, configuration),
            (data_type, 'w'): extract_peptides(df, configuration),
        }

    return {
        ('modifiedprotein_subject', 'a'): extract_protein_modification_subject_rels(df, configuration),
        ('modifiedprotein_protein', 'a'): extract_protein_protein_modification_rels(df, configuration),
        ('modifiedprotein_peptide', 'a'): extract_peptide_protein_modification_rels(df, configuration),
        ('modifiedprotein', 'a'): extract_protein_modifications_rels(df, configuration),
        ('modifiedprotein_modification', 'a'): extract_protein_modifications_modification_rels(df, configuration),
    }
def get_configuration(processing_tool, data_type):
    """Look up the configuration of a data type for a processing tool.

    :param processing_tool: key of the tool in ``proteomics.yml``, or None.
    :param data_type: key of the data type within the tool configuration.
    :return: the matching configuration, or None when the tool is None or
        either key is absent.
    """
    if processing_tool is None:
        return None

    config = builder_utils.get_config(config_name="proteomics.yml", data_type='experiments')
    if processing_tool not in config:
        return None

    tool_configuration = config[processing_tool]
    if data_type not in tool_configuration:
        return None

    return tool_configuration[data_type]
def update_configuration(data_type, processing_tool, value_col='LFQ intensity', columns=[], drop_cols=[], filters=None, new_config={}):
    """Fetch a dataset configuration and adjust it in place.

    The configuration returned by ``get_configuration`` is modified directly:
    ``columns`` are appended, ``valueCol`` is set, ``drop_cols`` are removed
    from the column and attribute lists, ``filters`` replaces the existing
    filters when given, and every entry of ``new_config`` is copied over.

    :param data_type: data type key passed to ``get_configuration``.
    :param processing_tool: tool key passed to ``get_configuration``.
    :param str value_col: new value of ``valueCol``.
    :param list columns: column names appended to ``columns``.
    :param list drop_cols: names removed from ``columns`` and from the
        ``cols``/``regex`` attribute lists.
    :param filters: replacement for ``filters``; ignored when None.
    :param dict new_config: extra entries written into the configuration.
    :return: the updated configuration, or None if none was found.
    """
    configuration = get_configuration(processing_tool, data_type)
    if configuration is None:
        return configuration

    configuration['columns'].extend(columns)
    configuration['valueCol'] = value_col

    if len(drop_cols) > 0:
        def without_dropped(names):
            return [name for name in names if name not in drop_cols]

        configuration['columns'] = without_dropped(configuration['columns'])
        if 'attributes' in configuration:
            attribute_conf = configuration['attributes']
            for attr_kind in ('cols', 'regex'):
                if attr_kind in attribute_conf:
                    attribute_conf[attr_kind] = without_dropped(attribute_conf[attr_kind])

    if filters is not None:
        configuration['filters'] = filters

    for key in new_config:
        configuration[key] = new_config[key]

    return configuration
def parse_dataset(filepath, configuration):
    """Load a dataset and optionally log-transform its value columns.

    :param str filepath: path of the dataset file.
    :param dict configuration: dataset configuration; when it has a ``log``
        entry equal to 'log2' or 'log10', the value columns are transformed
        accordingly and infinite results are replaced by NaN.
    :return: the loaded DataFrame, or None if ``filepath`` is not a file.
    """
    if not os.path.isfile(filepath):
        return None

    data, _ = load_dataset(filepath, configuration)
    if data is None or 'log' not in configuration:
        return data

    log = configuration['log']
    cols = get_value_cols(data, configuration)
    transform = np.log2 if log == 'log2' else (np.log10 if log == 'log10' else None)
    if transform is not None:
        data[cols] = transform(data[cols]).replace([np.inf, -np.inf], np.nan)

    return data
def parse_standard_dataset(file_path, configuration):
    """Load a dataset and collapse replicate columns into one column per subject.

    Replicate columns are grouped either from the value columns (when the
    configuration has ``combine == 'valueCol'``) or from the regex columns
    returned by ``load_dataset`` (the default). Each group is replaced by a
    single column holding the result of ``calculate_median_replicates``; the
    replicate columns are then dropped, together with rows that are entirely NaN.

    :param str file_path: path of the dataset file.
    :param dict configuration: dataset configuration; must contain ``log``.
    :return: the processed DataFrame, or None if ``file_path`` is not a file
        or nothing was loaded.
    """
    dataset = None
    if not os.path.isfile(file_path):
        return dataset

    data, regex = load_dataset(file_path, configuration)
    if data is None:
        return dataset

    log = configuration['log']
    combine = configuration['combine'] if 'combine' in configuration else 'regex'
    if combine == 'valueCol':
        value_cols = get_value_cols(data, configuration)
        subjectDict = extract_subject_replicates(data, value_cols)
    else:
        subjectDict = extract_subject_replicates_from_regex(data, regex)

    delCols = []
    for subject in subjectDict:
        replicates = subjectDict[subject]
        delCols.extend(replicates)
        data[subject] = calculate_median_replicates(data[replicates], log)

    # ``axis`` must be passed by keyword: pandas >= 2.0 no longer accepts it
    # positionally in DataFrame.drop.
    dataset = data.drop(delCols, axis=1)
    dataset = dataset.dropna(how='all')

    return dataset
def check_columns(data, req_columns, generated_columns):
    """Return the required columns that are neither in ``data`` nor generated.

    :param data: DataFrame whose ``columns`` are checked.
    :param req_columns: iterable of required column names.
    :param generated_columns: iterable of names exempt from the check.
    :return: set of missing column names.
    """
    required = set(req_columns)
    present = set(data.columns)
    missing = required - present
    return missing - set(generated_columns)
def check_minimum_configuration(configuration):
    """Return the required configuration keys absent from ``configuration``.

    :param dict configuration: dataset configuration to validate.
    :return: set of missing keys (empty when the configuration is complete).
    """
    required_keys = {
        'columns', 'indexCol',
        'proteinCol', 'log',
        'file', 'valueCol', 'attributes',
    }
    provided_keys = set(configuration.keys())
    return required_keys - provided_keys
def load_dataset(uri, configuration):
    """Read a processed proteomics file and apply the configured filters.

    Entries of ``configuration['columns']`` that contain a '+' are treated as
    regular expressions and matched against the dataset columns; the remaining
    entries must exist in the dataset unless they are listed under
    ``generated_columns``.

    Processing steps:
      * keep only rows where every ``filters`` column is null, then drop
        those columns;
      * for each ``numeric filter`` entry, keep rows whose column value is
        greater than or equal to the threshold;
      * drop rows without a value in ``proteinCol`` and expand protein groups
        with ``expand_groups``;
      * replace the literal 'Filtered' with NaN, convert the value columns to
        numeric and drop rows where all value columns are NaN.

    :param str uri: location of the file, passed to ``builder_utils.readDataset``.
    :param dict configuration: dataset configuration.
    :return: tuple ``(data, regexCols)`` with the filtered DataFrame and the
        list of regex column patterns.
    :raises Exception: if required columns are missing, or a numeric filter
        refers to a column that is not among the configured columns.
    """
    filters = None
    columns = configuration["columns"]
    regexCols = [c.replace("\\\\", "\\") for c in columns if '+' in c]
    columns = set(columns).difference(regexCols)
    generated_columns = []
    if 'generated_columns' in configuration:
        generated_columns = configuration['generated_columns']
    if 'filters' in configuration:
        filters = configuration["filters"]
    indexCol = configuration["indexCol"]

    data = builder_utils.readDataset(uri)
    missing_cols = check_columns(data, columns, generated_columns)
    if len(missing_cols) > 0:
        raise Exception("Error when importing proteomics experiment.\n Missing columns: {}".format(",".join(missing_cols)))

    if filters is not None:
        # Pass ``axis`` by keyword; positional reduction arguments are not
        # accepted consistently across pandas versions.
        data = data[data[filters].isnull().all(axis=1)]
        data = data.drop(filters, axis=1)
        columns = set(columns).difference(filters)

    if 'numeric filter' in configuration:
        for f in configuration['numeric filter']:
            # Each entry is a single-item mapping: column -> minimum value.
            key = list(f.keys())[0]
            if key in columns:
                value = f[key]
                data = data[data[key] >= value]
            else:
                raise Exception("Error when applying numeric filter on {}. The column is not in the dataset".format(f))

    data = data.dropna(subset=[configuration["proteinCol"]], axis=0)
    data = expand_groups(data, configuration)
    # expand_groups moved indexCol into the index, so it is no longer a column.
    columns.remove(indexCol)

    for regex in regexCols:
        r = re.compile(regex)
        columns.update(set(filter(r.match, data.columns)))

    data = data[list(columns)].replace('Filtered', np.nan)
    value_cols = get_value_cols(data, configuration)
    data[value_cols] = data[value_cols].apply(lambda x: pd.to_numeric(x, errors='coerce'))
    data = data.dropna(how='all', subset=value_cols, axis=0)

    return data, regexCols
def remove_contaminant_tag(column, tag='CON__'):
    """Strip every occurrence of ``tag`` from each string in ``column``.

    :param column: iterable of strings.
    :param str tag: substring to remove.
    :return: list with the cleaned strings, in the original order.
    """
    return [entry.replace(tag, '') for entry in column]
def _split_multivalue_column(column):
    """Split ';'-separated entries into one row per value, repeating the
    original index label for every value and dropping missing entries."""
    return column.str.split(';').explode().dropna()


def expand_groups(data, configuration):
    """Expand ';'-separated protein groups into one row per protein.

    The group column is taken from ``configuration['groupCol']``. When it is
    not configured (or None) the current index is used under the name 'id'
    and that name is written back into ``configuration``. When it is
    configured but not a column of ``data``, the current index is used under
    the configured name.

    ``proteinCol`` (and ``multipositions``, when configured) are split on ';'
    and the other columns are repeated for each value. An ``is_razor`` column
    marks the first row of every group, and ``indexCol`` becomes the index.

    :param data: DataFrame to expand. Its index name may be changed in place.
    :param dict configuration: dataset configuration (``proteinCol``,
        ``indexCol`` and optionally ``groupCol``, ``multipositions``,
        ``contaminant_tag``).
    :return: the expanded DataFrame indexed by ``indexCol``.
    """
    default_group_col = 'id'
    protein_col = configuration["proteinCol"]
    if "groupCol" not in configuration or configuration["groupCol"] is None:
        data.index.name = default_group_col
        data = data.reset_index()
        configuration['groupCol'] = default_group_col
    elif configuration['groupCol'] not in data.columns:
        data.index.name = configuration['groupCol']
        data = data.reset_index()

    proteins = _split_multivalue_column(data[protein_col])
    data = data.drop(columns=[protein_col])
    expanded = proteins.to_frame(protein_col)
    if "multipositions" in configuration:
        positions_col = configuration["multipositions"]
        positions = _split_multivalue_column(data[positions_col])
        data = data.drop(columns=[positions_col])
        expanded = pd.concat([proteins, positions], axis=1, keys=[protein_col, positions_col])

    # Joining on the repeated index labels duplicates the remaining columns
    # once per expanded protein.
    data = data.join(expanded)
    if 'contaminant_tag' in configuration:
        data[protein_col] = remove_contaminant_tag(column=data[protein_col].tolist(), tag=configuration['contaminant_tag'])
    data["is_razor"] = ~ data[configuration["groupCol"]].duplicated()
    data = data.set_index(configuration["indexCol"])

    return data
############## ProteinModification entity ####################
def extract_modification_protein_rels(data, configuration):
    """Build HAS_MODIFICATION relationships from the index of ``data`` to a modification.

    :param data: DataFrame whose index provides START_ID and which contains
        the two ``positionCols`` (position, then residue).
    :param dict configuration: must provide ``modId`` and ``positionCols``.
    :return: de-duplicated DataFrame with columns START_ID, END_ID, TYPE,
        position (int64) and residue.
    """
    modificationId = configuration["modId"]
    position_cols = configuration["positionCols"]

    rels = data[position_cols].reset_index()
    rels.columns = ["START_ID", "position", "residue"]
    rels["END_ID"] = modificationId
    rels['TYPE'] = "HAS_MODIFICATION"
    rels = rels[['START_ID', 'END_ID', 'TYPE', "position", "residue"]]
    rels['position'] = rels['position'].astype('int64')

    return rels.drop_duplicates()
def extract_protein_modification_subject_rels(data, configuration):
    """Build HAS_QUANTIFIED_MODIFIED_PROTEIN relationships.

    The modified-protein identifier is ``<protein>_<positionCols[1]><positionCols[0]>-<mod_acronym>``.
    Value columns are selected with the ``valueCol`` regex; the second
    space-separated token of each value column name is used as START_ID.
    Attribute columns found by ``extract_attributes`` are merged in.

    :param data: DataFrame with protein, position and value columns.
    :param dict configuration: must provide ``positionCols``, ``proteinCol``,
        ``mod_acronym``, ``valueCol`` and ``attributes``.
    :return: de-duplicated DataFrame with START_ID, END_ID, TYPE, value and
        attribute columns ('PG.' removed from column names).
    """
    positionCols = configuration["positionCols"]
    proteinCol = configuration["proteinCol"]
    id_cols = [proteinCol]
    id_cols.extend(positionCols)

    indexed = data.reset_index()
    indexed["END_ID"] = indexed[proteinCol].map(str) + "_" + indexed[positionCols[1]].map(str) + indexed[positionCols[0]].map(str) + '-' + configuration["mod_acronym"]
    indexed = indexed.set_index("END_ID")
    # Keep the full frame (re-indexed by modified protein) for attribute lookup.
    newIndexdf = indexed.copy()

    value_regex = configuration["valueCol"].replace("\\\\", "\\")
    rels = indexed.drop(id_cols, axis=1).filter(regex=value_regex)
    rels.columns = [name.split(" ")[1] for name in rels.columns]
    rels = rels.stack().reset_index()
    rels.columns = ["c{}".format(i) for i in range(len(rels.columns))]

    columns = ['END_ID', 'START_ID', "value"]
    attributes = configuration["attributes"]
    (cAttributes, cCols), (rAttributes, regexCols) = extract_attributes(newIndexdf, attributes)
    if not rAttributes.empty:
        rels = merge_regex_attributes(rels, rAttributes, ["c0", "c1"], regexCols)
        columns.extend(regexCols)
    if not cAttributes.empty:
        rels = merge_col_attributes(rels, cAttributes, "c0")
        columns.extend(cCols)

    rels['TYPE'] = "HAS_QUANTIFIED_MODIFIED_PROTEIN"
    columns.append("TYPE")
    rels.columns = columns
    rels = rels[['START_ID', 'END_ID', 'TYPE', "value"] + regexCols + cCols]
    rels.columns = [name.replace('PG.', '') for name in rels.columns]

    return rels.drop_duplicates()
def extract_protein_protein_modification_rels(data, configuration):
    """Build HAS_MODIFIED_SITE relationships from proteins to modified proteins.

    The modified-protein identifier is ``<protein>_<positionCols[1]><positionCols[0]>-<mod_acronym>``.

    :param data: DataFrame containing ``proteinCol`` and the ``positionCols``.
    :param dict configuration: must provide ``positionCols``, ``proteinCol``
        and ``mod_acronym``.
    :return: de-duplicated DataFrame with columns START_ID, END_ID, TYPE.
    """
    positionCols = configuration["positionCols"]
    proteinCol = configuration["proteinCol"]
    selected = [proteinCol]
    selected.extend(positionCols)

    sites = data.copy().reset_index()
    sites = sites[selected]
    sites["END_ID"] = sites[proteinCol].map(str) + "_" + sites[positionCols[1]].map(str) + sites[positionCols[0]].map(str) + '-' + configuration["mod_acronym"]

    rels = sites.drop(positionCols, axis=1)
    rels = rels[["END_ID", proteinCol]]
    rels.columns = ["END_ID", "START_ID"]
    rels['TYPE'] = "HAS_MODIFIED_SITE"
    rels = rels[['START_ID', 'END_ID', 'TYPE']]

    return rels.drop_duplicates()
def extract_peptide_protein_modification_rels(data, configuration):
    """Build HAS_MODIFIED_SITE relationships from peptides to modified proteins.

    START_ID is the upper-cased value of ``sequenceCol``; END_ID is
    ``<protein>_<positionCols[1]><positionCols[0]>-<mod_acronym>``.

    :param data: DataFrame containing ``sequenceCol``, ``proteinCol`` and the
        ``positionCols``.
    :param dict configuration: must provide ``positionCols``, ``proteinCol``,
        ``sequenceCol`` and ``mod_acronym``.
    :return: de-duplicated DataFrame with columns START_ID, END_ID, TYPE.
    """
    positionCols = configuration["positionCols"]
    proteinCol = configuration["proteinCol"]
    sequenceCol = configuration["sequenceCol"]
    selected = [sequenceCol, proteinCol]
    selected.extend(positionCols)

    sites = data.copy().reset_index()
    sites = sites[selected]
    sites["END_ID"] = sites[proteinCol].map(str) + "_" + sites[positionCols[1]].map(str) + sites[positionCols[0]].map(str) + '-' + configuration["mod_acronym"]

    rels = sites.drop([proteinCol] + positionCols, axis=1)
    rels = rels[["END_ID", sequenceCol]]
    rels.columns = ["END_ID", "START_ID"]
    rels["START_ID"] = rels["START_ID"].str.upper()
    rels['TYPE'] = "HAS_MODIFIED_SITE"
    rels = rels[['START_ID', 'END_ID', 'TYPE']]

    return rels.drop_duplicates()
def extract_protein_modifications_rels(data, configuration):
    """Build the modified-protein node table.

    The identifier is ``<protein>_<positionCols[1]><positionCols[0]>-<mod_acronym>``;
    underscores in ``sequenceCol`` are replaced by '-'.

    :param data: DataFrame containing ``proteinCol``, ``sequenceCol`` and the
        two ``positionCols``.
    :param dict configuration: must provide ``positionCols``, ``proteinCol``,
        ``sequenceCol`` and ``mod_acronym``.
    :return: de-duplicated DataFrame with columns ID, protein,
        sequence_window, position, residue and source.
    """
    positionCols = configuration["positionCols"]
    proteinCol = configuration["proteinCol"]
    sequenceCol = configuration["sequenceCol"]
    selected = [proteinCol, sequenceCol]
    selected.extend(positionCols)

    nodes = data.copy().reset_index()
    nodes = nodes[selected]
    identifiers = nodes[proteinCol].map(str) + "_" + nodes[positionCols[1]].map(str) + nodes[positionCols[0]].map(str) + '-' + configuration["mod_acronym"]
    # The identifier goes first, ahead of the selected columns.
    nodes.insert(0, "ID", identifiers)
    nodes[sequenceCol] = nodes[sequenceCol].str.replace('_', '-')
    nodes["source"] = "experimentally_identified"
    nodes.columns = ["ID", "protein", "sequence_window", "position", "residue", "source"]

    return nodes.drop_duplicates()
def extract_protein_modifications_modification_rels(data, configuration):
    """Link every modified protein to the configured modification.

    START_ID is ``<protein>_<positionCols[1]><positionCols[0]>-<mod_acronym>``
    and END_ID is ``modId``. Rows are not de-duplicated.

    :param data: DataFrame containing ``proteinCol``, ``sequenceCol`` and the
        ``positionCols``.
    :param dict configuration: must provide ``modId``, ``positionCols``,
        ``proteinCol``, ``sequenceCol`` and ``mod_acronym``.
    :return: DataFrame with columns START_ID and END_ID.
    """
    modID = configuration["modId"]
    positionCols = configuration["positionCols"]
    proteinCol = configuration["proteinCol"]
    sequenceCol = configuration["sequenceCol"]
    selected = [proteinCol, sequenceCol]
    selected.extend(positionCols)

    rels = data.copy().reset_index()
    rels = rels[selected]
    rels["START_ID"] = rels[proteinCol].map(str) + "_" + rels[positionCols[1]].map(str) + rels[positionCols[0]].map(str) + '-' + configuration["mod_acronym"]
    rels["END_ID"] = modID

    return rels[["START_ID", "END_ID"]]
################# Peptide entity ####################
def extract_peptides(data, configuration):
    """Build the peptide node table from the index of ``data``.

    :param data: DataFrame indexed by peptide identifier.
    :param dict configuration: must provide ``type``, stored in the ``type`` column.
    :return: DataFrame with columns ID, type and count, where count is the
        number of rows of ``data`` sharing the identifier.
    """
    peptides = data.copy()
    peptides["type"] = configuration["type"]
    peptides = peptides["type"].reset_index()

    group_keys = peptides.columns.tolist()
    counted = peptides.groupby(group_keys).size().reset_index()
    counted = counted.rename(columns={0: 'count'})
    counted.columns = ["ID", "type", "count"]

    return counted.drop_duplicates()
def extract_peptide_subject_rels(data, configuration):
    """Build HAS_QUANTIFIED_PEPTIDE relationships.

    Only the first row for every index value is used. Value columns are
    selected with the ``valueCol`` regex; the second space-separated token of
    each value column name is used as START_ID and the index as END_ID.
    Attribute columns found by ``extract_attributes`` are merged in.

    :param data: DataFrame indexed by peptide with value and attribute columns.
    :param dict configuration: must provide ``valueCol`` and ``attributes``.
    :return: de-duplicated DataFrame with START_ID, END_ID, TYPE, value and
        attribute columns ('PG.' removed from column names).
    """
    first_rows = data[~data.index.duplicated(keep='first')]
    value_regex = configuration["valueCol"].replace("\\\\", "\\")
    rels = first_rows.filter(regex=value_regex)
    attributes = configuration["attributes"]
    rels.columns = [name.split(" ")[1] for name in rels.columns]
    rels = rels.stack().reset_index()
    rels.columns = ["c{}".format(i) for i in range(len(rels.columns))]

    columns = ['END_ID', 'START_ID', "value"]
    (cAttributes, cCols), (rAttributes, regexCols) = extract_attributes(first_rows, attributes)
    if not rAttributes.empty:
        rels = merge_regex_attributes(rels, rAttributes, ["c0", "c1"], regexCols)
        columns.extend(regexCols)
    if not cAttributes.empty:
        rels = merge_col_attributes(rels, cAttributes, "c0")
        columns.extend(cCols)

    rels['TYPE'] = "HAS_QUANTIFIED_PEPTIDE"
    columns.append("TYPE")
    rels.columns = columns
    rels = rels[['START_ID', 'END_ID', 'TYPE', "value"] + regexCols + cCols]
    rels.columns = [name.replace('PG.', '') for name in rels.columns]

    return rels.drop_duplicates()
def extract_peptide_protein_rels(data, configuration):
    """Build BELONGS_TO_PROTEIN relationships from peptides to proteins.

    :param data: DataFrame indexed by peptide, containing ``proteinCol`` and
        the two ``positionCols``.
    :param dict configuration: must provide ``proteinCol`` and ``positionCols``.
    :return: DataFrame with columns START_ID (the index), END_ID (the
        protein), TYPE and source.
    """
    selected = [configuration["proteinCol"]]
    selected.extend(configuration["positionCols"])

    rels = data[selected].reset_index()
    # Index, protein and the two position columns, in that order.
    rels.columns = ['START_ID', 'END_ID', "start", "end"]
    rels['TYPE'] = "BELONGS_TO_PROTEIN"
    rels['source'] = 'experimentally_identified'

    return rels[['START_ID', 'END_ID', 'TYPE', 'source']]
def extract_protein_subject_rels(data, configuration):
    """Build HAS_QUANTIFIED_PROTEIN relationships.

    Value columns are selected with the ``valueCol`` regex. Unless
    ``valueCol`` is 'AS', the ``valueCol`` pattern (with an optional leading
    '.' and trailing whitespace) is removed from each column name; what is
    left is used as START_ID and the index of ``data`` as END_ID. Attribute
    columns found by ``extract_attributes`` are merged in.

    :param data: DataFrame indexed by protein with value and attribute columns.
    :param dict configuration: must provide ``valueCol`` and ``attributes``.
    :return: DataFrame with START_ID, END_ID, TYPE, value and attribute
        columns ('PG.' removed from column names). Rows are not de-duplicated.
    """
    value_col = configuration["valueCol"]
    aux = data.filter(regex=value_col)
    attributes = configuration["attributes"]
    if value_col != 'AS':
        # Raw strings: "\." and "\s" are invalid escapes in normal literals.
        value_pattern = r"\.?" + value_col + r"\s?"
        aux.columns = [re.sub(value_pattern, '', c).strip() for c in aux.columns]
    else:
        aux.columns = [c.strip() for c in aux.columns]
    aux = aux.stack()
    aux = aux.reset_index()
    aux.columns = ["c" + str(i) for i in range(len(aux.columns))]

    columns = ['END_ID', 'START_ID', "value"]
    (cAttributes, cCols), (rAttributes, regexCols) = extract_attributes(data, attributes)
    if not rAttributes.empty:
        aux = merge_regex_attributes(aux, rAttributes, ["c0", "c1"], regexCols)
        columns.extend(regexCols)
    if not cAttributes.empty:
        aux = merge_col_attributes(aux, cAttributes, "c0")
        columns.extend(cCols)

    aux['TYPE'] = "HAS_QUANTIFIED_PROTEIN"
    columns.append("TYPE")
    aux.columns = columns
    aux = aux[['START_ID', 'END_ID', 'TYPE', "value"] + regexCols + cCols]
    aux.columns = [c.replace('PG.', '') for c in aux.columns]

    return aux
def get_value_cols(data, configuration):
    """Return the columns of ``data`` whose name contains ``valueCol``.

    :param data: DataFrame whose columns are inspected.
    :param dict configuration: dataset configuration; ``valueCol`` is matched
        as a plain substring.
    :return: list of matching column names; empty when ``valueCol`` is not configured.
    """
    if 'valueCol' not in configuration:
        return []

    return [name for name in data.columns if configuration['valueCol'] in name]
def extract_subject_replicates_from_regex(data, regex):
    """Group the columns matching each regex by subject identifier.

    Column names are split on '_'. With a single field the identifier is
    ``" " + field``. Otherwise it is made of the first field without its last
    space-separated token, the second field and, when present, the third
    field, all joined by single spaces.

    :param data: DataFrame whose columns are filtered.
    :param regex: iterable of regular expressions selecting the columns.
    :return: defaultdict mapping each identifier to its list of column names.
    """
    subjectDict = defaultdict(list)
    for pattern in regex:
        for column in data.filter(regex=pattern).columns:
            parts = column.split('_')
            if len(parts) > 1:
                prefix = " ".join(parts[0].split(' ')[:-1])
                subject = parts[1]
                timepoint = " " + parts[2] if len(parts) > 2 else ""
            else:
                prefix, subject, timepoint = "", parts[0], ""
            subjectDict[prefix + " " + subject + timepoint].append(column)

    return subjectDict
def extract_subject_replicates(data, value_cols):
    """Group value columns by subject identifier.

    Column names are split on '_'. With a single field the identifier is
    ``" " + field``. Otherwise it is made of the first field without its last
    space-separated token, the second field and, when present, the third
    field, all joined by single spaces.

    :param data: not used by this function.
    :param value_cols: iterable of column names to group.
    :return: defaultdict mapping each identifier to its list of column names.
    """
    subjectDict = defaultdict(list)
    for column in value_cols:
        parts = column.split('_')
        if len(parts) > 1:
            prefix = " ".join(parts[0].split(' ')[:-1])
            subject = parts[1]
            timepoint = " " + parts[2] if len(parts) > 2 else ""
        else:
            prefix, subject, timepoint = "", parts[0], ""
        subjectDict[prefix + " " + subject + timepoint].append(column)

    return subjectDict
def extract_attributes(data, attributes):
    """Collect the attribute columns of ``data`` named in ``attributes``.

    Under the ``regex`` key every pattern matching at least one column is
    recorded and its columns collected. Under any other key the listed
    columns are collected by name, and their names with spaces replaced by
    '_' and '-' removed are returned as the output column names.

    :param data: DataFrame with a named index.
    :param attributes: mapping of attribute kind -> list of patterns or column names.
    :return: ``((col_frame, cCols), (regex_frame, regexCols))``; both frames
        are de-duplicated and keep the index of ``data``.
    """
    col_frame = pd.DataFrame(index=data.index)
    regex_frame = pd.DataFrame(index=data.index)
    cCols = []
    regexCols = []
    for kind in attributes:
        entries = attributes[kind]
        if kind != "regex":
            col_frame = col_frame.join(data[entries])
            cCols = [name.replace(' ', '_').replace('-', '') for name in entries]
            continue
        for pattern in entries:
            matched = data.filter(regex=pattern)
            if matched.empty:
                continue
            regexCols.append(pattern)
            regex_frame = regex_frame.join(matched)

    # De-duplicate on index plus values, then restore the original index.
    regex_index_name = regex_frame.index.name
    col_index_name = col_frame.index.name
    regex_frame = regex_frame.reset_index().drop_duplicates().set_index(regex_index_name)
    col_frame = col_frame.reset_index().drop_duplicates().set_index(col_index_name)

    return (col_frame, cCols), (regex_frame, regexCols)
def merge_regex_attributes(data, attributes, index, regexCols):
    """Join per-subject attribute values onto a long-format table.

    For every pattern in ``regexCols`` the matching columns of ``attributes``
    are selected, the pattern (with an optional leading '.' and trailing
    whitespace) is stripped from their names, and the frame is stacked into
    long format with columns ``c0, c1, c2``. It is then joined to ``data`` on
    ``index``; the incoming value column gets the suffix 'test' because
    ``data`` already has a ``c2`` column.

    :param data: long-format DataFrame containing the ``index`` columns.
    :param attributes: wide DataFrame of attribute columns.
    :param list index: column names to sort and join on (e.g. ``["c0", "c1"]``).
    :param list regexCols: patterns identifying the attribute columns.
    :return: ``data`` sorted by ``index`` with one extra column per pattern.
    """
    data = data.sort_values(by=index)
    data = data.set_index(index)
    if not attributes.empty:
        for rc in regexCols:
            attr_aux = attributes.filter(regex=rc)
            # Raw strings: "\." and "\s" are invalid escapes in normal literals.
            name_pattern = r"\.?" + rc + r"\s?"
            attr_aux.columns = [re.sub(name_pattern, '', c).strip() for c in attr_aux.columns]
            attr_aux = attr_aux.stack()
            attr_aux = attr_aux.reset_index()
            attr_aux.columns = ["c" + str(i) for i in range(len(attr_aux.columns))]
            attr_aux = attr_aux.sort_values(by=index)
            data = data.join(attr_aux.set_index(index), rsuffix='test')
    data = data.reset_index()

    return data
[docs]def merge_col_attributes(data, attributes, index):
if not attributes.empty:
data = data.set_index(index)
data = data.join(attributes)
del(attributes)
data = data.reset_index()
return data
def calculate_median_replicates(data, log="log2"):
    """Return the row-wise median of the replicate columns.

    Values are first converted to numeric (anything unparsable becomes NaN).
    When ``log`` is 'log2' or 'log10' the values are log-transformed and
    infinite results replaced by NaN before the median is taken; any other
    value of ``log`` takes the median of the untransformed values. NaN
    entries are ignored by the median.

    :param data: DataFrame with one column per replicate.
    :param log: 'log2', 'log10' or anything else for no transformation.
    :return: Series with the median of each row.
    """
    data = data.apply(pd.to_numeric, errors='coerce')
    if log == "log2":
        data = np.log2(data).replace([np.inf, -np.inf], np.nan)
    elif log == "log10":
        # The transformation applies to the data, not to the median that has
        # not been computed yet.
        data = np.log10(data).replace([np.inf, -np.inf], np.nan)

    return data.median(axis=1, skipna=True)
def update_groups(data, groups):
    """Attach group information to ``data`` by matching START_ID.

    :param data: DataFrame with a START_ID column.
    :param groups: Series indexed by the START_ID values.
    :return: ``data`` with the ``groups`` values joined on START_ID.
    """
    group_frame = groups.to_frame()
    return data.join(group_frame, on='START_ID')
def get_dataset_configuration(processing_format, data_type):
    """Return the configuration of a processing format or of one of its data types.

    :param processing_format: key of the format in ``proteomics.yml``.
    :param data_type: key of the data type, or None to get the whole format
        configuration.
    :return: the matching configuration, or an empty dict when the format or
        the data type is not configured.
    """
    config = builder_utils.get_config(config_name="proteomics.yml", data_type='experiments')
    if processing_format not in config:
        return {}

    format_config = config[processing_format]
    if data_type is None:
        return format_config
    if data_type in format_config:
        return format_config[data_type]

    return {}