#!/usr/bin/env python
"""Add reactions, genes and more to a model based on different gap-filling methods.
All (current) algorithms are separated into three steps: finding missing genes, finding missing reactions
and trying to add the found as missing entities to the model.
Available gap filling methods:
- | :py:class:`~refinegems.classes.gapfill.KEGGapFiller`
| Mainly utilises information from the KEGG database. Needs a KEGG organism ID.
| Estimated runtime: *to be determined*
- | :py:class:`~refinegems.classes.gapfill.BioCycGapFiller`
| Mainly utilises information from the BioCyc database. Requires access to BioCyc SmartTables.
| Estimated runtime: *to be determined*
- | :py:class:`~refinegems.classes.gapfill.GeneGapFiller`
| Search for gaps using the GFF file and information from SwissProt.
| Estimated runtime: *to be determined*
"""
__author__ = "Famke Baeuerle, Gwendolyn O. Döbel, Carolin Brune, Finn Mier and Dr. Reihaneh Mostolizadeh"
############################################################################
# requirements
############################################################################
from abc import ABC, abstractmethod
import cobra
from cobra.io.sbml import _f_gene, _f_gene_rev
from copy import deepcopy
import io
import json
import logging
import math
import numpy as np
import os
import pandas as pd
import re
import warnings
from bioservices.kegg import KEGG
from itertools import chain
from libsbml import Model as libModel
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Literal, Union
from tqdm import tqdm
tqdm.pandas()
from ..developement.decorators import *
from ..utility.db_access import (
get_ec_from_ncbi,
get_ec_via_swissprot,
parse_KEGG_gene,
parse_KEGG_ec,
map_to_homologs,
)
from ..utility.io import (
load_a_table_from_database,
parse_gff_for_cds,
load_model,
write_model_to_file,
)
from ..utility.entities import (
create_gp,
create_gpr,
build_reaction_bigg,
build_reaction_kegg,
build_reaction_mnx,
isreaction_complete,
parse_reac_str,
validate_reaction_compartment_bigg,
REF_COL_GF_GENE_MAP,
)
from .medium import Medium, medium_to_model
from .reports import GapFillerReport
# @Note:
# some reactions have @DEBUGGING
# enable these parts to shorten runtime during debugging (used to work on a subset,
# not on the whole input)
############################################################################
# variables
############################################################################
############################################################################
# functions
############################################################################
# Mapping for BioCyc Reactions
# ----------------------------
[docs]
def map_biocyc_to_reac(
biocyc_reacs: pd.DataFrame, use_MNX: bool = True, use_BiGG: bool = True
) -> pd.DataFrame:
"""Based on a table containing BioCyc reactions,
map them to reactions in other databases (if a mapping is possible)
Args:
- biocyc_reacs (pd.DataFrame):
Table containing BioCyc reactions information.
Should contain the columns: Reaction | Object ID | EC-Number | Spontaneous?
Can be doenloaded as a SmartTable from BioCyc.
- use_MNX (bool, optional):
Try mapping using the MetaNetX database.
Defaults to True.
- use_BiGG (bool, optional):
Try mapping using the BiGG database.
Defaults to True.
Returns:
pd.DataFrame:
The extended table.
"""
def _clean_res_row(
res_row: pd.Series, mapped2db: Literal["BiGG", "MetaNetX"]
) -> pd.Series:
"""Clean a row of a table containing mapping results from BioCyc to another database
Args:
- res_row (pd.Series):
Row containing a mapping from BioCyc to another database
- mapped2db (Literal['BiGG', 'MetaNetX']):
Database where Biocyc entries where mapped to. One of 'BiGG', 'MetaNetX'.
Returns:
pd.Series:
The cleaned row.
"""
# Mapping for equation column name for other databases
dbeq2eq = {"BiGG": "reaction_string", "MetaNetX": "mnx_equation"}
# Get list of mapped to database IDs in row if available
id_list = list(res_row[mapped2db]) if res_row[mapped2db] else None
if id_list:
# Move BioCyc IDs to references
biocyc_reac_id = res_row["id"]
res_row["id"] = str(id_list[0]) # Ensure ID is a string
# Remove ID from column 'id' from list in column 'alias'
if len(id_list) != 1:
id_list.remove(res_row["id"])
alias = id_list
else:
alias = None
# Move BioCyc IDs and alias to references
res_row["reference"] = {"metacyc.reaction": biocyc_reac_id, "alias": alias}
# Move equation from other database to equation column
res_row["equation"] = res_row[dbeq2eq.get(mapped2db)]
# Replace BioCyc in via column with mapped to database
res_row["via"] = mapped2db
return res_row
def _map_biocyc_to_mnx(unmapped_reacs: pd.DataFrame) -> pd.DataFrame:
"""
| Helper function for :py:func:`~refinegems.classes.gapfill.map_biocyc_to_reac`
| Maps Biocyc IDs in a table to the corresponding MetaNetX IDs &
| Adds information obtained via the MetaNetX IDs to the resulting table
Args:
- unmapped_reacs (pd.DataFrame):
Table containing BioCyc IDs
Returns:
pd.DataFrame:
The extended table.
"""
# Step 1: Mapping & Obtain information
# ------------------------------------
# Mapping to MetaNetX + addition of more information on MetaNetX
# Reactions
# Get MetaNetX BioCyc
mnx2biocyc_reacs = load_a_table_from_database(
"""
SELECT x.source, x.id, p.mnx_equation, p.\'ec-code\'
FROM mnx_reac_xref x INNER JOIN mnx_reac_prop p
USING(id)
WHERE x.source LIKE \'%metacyc%\'
OR x.source LIKE \'%biocyc%\'
"""
)
# Change column names to fit input table
mnx2biocyc_reacs.rename(
{"id": "MetaNetX", "source": "id"}, axis=1, inplace=True
)
# Transform id column to only contain BioCyc/MetaCyc IDs
mnx2biocyc_reacs["id"] = mnx2biocyc_reacs["id"].str.split(":", n=1).str[-1]
# Crop table to contain MetaNetX IDs set per BioCyc ID
mnx_as_list = (
mnx2biocyc_reacs.groupby("id")["MetaNetX"]
.apply(set)
.reset_index(name="MetaNetX")
)
mnx2biocyc_reacs.drop("MetaNetX", axis=1, inplace=True)
mnx2biocyc_reacs = mnx_as_list.merge(mnx2biocyc_reacs, on="id")
# Drop duplicates to get the unique BioCyc IDs
mnx2biocyc_reacs.drop_duplicates(subset="id", inplace=True)
# Merge mnx2biocyc_reacs with unmapped_reacs to get new information
reacs_mapped = mnx2biocyc_reacs.merge(unmapped_reacs, on="id", how="right")
# Step 2: Clean-up
# ----------------
# Remove all NaNs from DataFrame
reacs_mapped.replace(np.nan, None, inplace=True)
# Turn MetaNetX column in single value, rename column to id
# & if multiple MetaNetX IDs exist add to column alias
reacs_mapped = reacs_mapped.apply(_clean_res_row, args=("MetaNetX",), axis=1)
# Create list of EC codes in column ec-code_x,
# Join both ec-code columns into one & Create a set of ec-codes
reacs_mapped["ec-code_x"] = reacs_mapped["ec-code_x"].str.split(r"\s*;\s*")
reacs_mapped["ec-code_x"] = (
reacs_mapped["ec-code_x"]
.fillna({i: [] for i in reacs_mapped.index})
.map(set)
.map(list)
)
reacs_mapped["ec-code"] = (
(reacs_mapped["ec-code_x"] + reacs_mapped["ec-code_y"]).map(set).map(list)
)
# Drop all unnecessary columns
reacs_mapped.drop(
["MetaNetX", "mnx_equation", "ec-code_x", "ec-code_y"], axis=1, inplace=True
)
# Step 3: Return result
# ---------------------
return reacs_mapped
def _map_biocyc_to_bigg(unmapped_reacs: pd.DataFrame) -> pd.DataFrame:
"""
| Helper function for :py:func:`~refinegems.classes.gapfill.map_biocyc_to_reac`
| Maps Biocyc IDs in a table to the corresponding BiGG IDs &
| Adds information obtained via the BiGG IDs to the resulting table
Args:
- unmapped_reacs (pd.DataFrame):
Table containing BioCyc IDs
Returns:
pd.DataFrame:
The extended table.
"""
# Step 1: Mapping & Obtain information
# ------------------------------------
# Mapping to BiGG + addition of more information on BiGG Reactions
# Get BiGG BioCyc
bigg2biocyc_reacs = load_a_table_from_database(
"SELECT id, BioCyc, name, reaction_string from bigg_reactions"
)
# Filter BiGG table to contain only reaction IDs with valid compartments
mask = bigg2biocyc_reacs.apply(
lambda x: validate_reaction_compartment_bigg(
parse_reac_str(x["reaction_string"], "BiGG")[2]
),
axis=1,
)
mask.replace("exchange", True, inplace=True)
bigg2biocyc_reacs = bigg2biocyc_reacs[mask]
# Change column names to fit input table
bigg2biocyc_reacs.rename({"id": "BiGG", "BioCyc": "id"}, axis=1, inplace=True)
# Crop table to contain BiGG IDs set per BioCyc ID
bigg_as_list = (
bigg2biocyc_reacs.groupby("id")["BiGG"].apply(set).reset_index(name="BiGG")
)
bigg2biocyc_reacs.drop("BiGG", axis=1, inplace=True)
bigg2biocyc_reacs = bigg_as_list.merge(bigg2biocyc_reacs, on="id")
# Drop duplicates to get the unique BioCyc IDs
bigg2biocyc_reacs.drop_duplicates(subset="id", inplace=True)
# Merge bigg2biocyc_reacs with unmapped_reacs to get new information
reacs_mapped = bigg2biocyc_reacs.merge(unmapped_reacs, on="id", how="right")
# Step 2: Clean-up
# ----------------
# Remove all NaNs from DataFrame
reacs_mapped.replace(np.nan, None, inplace=True)
# Turn BiGG column in single value, rename column to id
# & if multiple BiGG IDs exist add to column alias
reacs_mapped = reacs_mapped.apply(_clean_res_row, args=("BiGG",), axis=1)
# Drop all unnecessary columns
reacs_mapped.drop(["BiGG", "reaction_string", "name"], axis=1, inplace=True)
# Step 3: Return result
# ---------------------
return reacs_mapped
# Step 1: Get missing reactions without GPR
# -----------------------------------------
mask = biocyc_reacs["add_to_GPR"].isna()
biocyc_reacs_to_map = biocyc_reacs[mask]
# Step 2: Mapping
# ---------------
# Map to MetaNetX
if use_MNX:
biocyc_reacs_to_map = _map_biocyc_to_mnx(biocyc_reacs_to_map)
# Map to BiGG
if use_BiGG:
to_map = biocyc_reacs_to_map[biocyc_reacs_to_map["via"] == "BioCyc"]
if len(to_map) > 0:
to_map = _map_biocyc_to_bigg(to_map)
biocyc_reacs_to_map = pd.concat(
[to_map, biocyc_reacs_to_map[~(biocyc_reacs_to_map["via"] == "BioCyc")]]
)
# Step 3: Gather result(s)
# ------------------------
# Merge all tables if necessary
biocyc_reacs = pd.concat(
[biocyc_reacs_to_map, biocyc_reacs[~biocyc_reacs["add_to_GPR"].isna()]],
sort=True,
ignore_index=True,
)
# Return results
return biocyc_reacs
# Mapping of EC numbers
# ---------------------
[docs]
def map_ec_to_reac(
table: pd.DataFrame,
use_MNX: bool = True,
use_BiGG: bool = True,
use_KEGG: bool = True,
threshold_add_reacs: int = 5,
) -> pd.DataFrame:
"""Based on a table of NCBI protein IDs and EC numbers,
map them to reactions via different databases
(if a mapping is possible).
input table should have format:
``ec-code | ncbiprotein``
output table has the format:
``ec-code | ncbiprotein | id | equation | reference | is_transport | via``
Args:
- table (pd.DataFrame):
The input table.
- use_MNX (bool, optional):
Try mapping using the MetaNetX database.
Defaults to True.
- use_BiGG (bool, optional):
Try mapping using the BiGG database.
Defaults to True.
- use_KEGG (bool, optional):
Try mapping using the KEGG database.
Defaults to True.
- threshold_add_reacs (int, optional):
Maximum number of reactions allowed per EC number per ncbiprotein ID
to be added. Otherwise skip addition of reactions due to insufficient evidence
Defaults to 5.
Returns:
pd.DataFrame:
The extended table.
"""
def _map_ec_to_reac_mnx(
unmapped_reacs: pd.DataFrame, threshold_add_reacs: int = 5
) -> pd.DataFrame:
"""Helper function of :py:func:`~refinegems.classes.gapfill.map_ec_to_reac`
for the mapping using the MetaNetX database.
Args:
- unmapped_reacs (pd.DataFrame):
The input table (ec-code and ncbiprotein columns)
- threshold_add_reacs (int, optional):
Maximum number of reactions allowed per EC number per ncbiprotein ID
to be added. Otherwise skip addition of reactions due to insufficient evidence
Defaults to 5.
Returns:
pd.DataFrame:
The extended table
"""
# input: pd.DataFrame with at least the ec-code column
# load MNX reac prop table
mnx_reac_prop = load_a_table_from_database("mnx_reac_prop", False)
# convert table into one EC-number per row
mnx_reac_prop.drop("is_balanced", inplace=True, axis=1)
mnx_reac_prop["ec-code"] = mnx_reac_prop["ec-code"].apply(
lambda x: x.split(r";") if isinstance(x, str) else None
)
# exclude entries without EC-number
mnx_reac_prop = mnx_reac_prop.explode("ec-code").dropna(subset="ec-code")
# merge with unmapped reactions
reacs_mapped = unmapped_reacs.merge(mnx_reac_prop, on="ec-code", how="left")
# filter out mappings with more than x merged reactions, to ensure quality
reacs_mapped = reacs_mapped.explode("ncbiprotein")
reacs_mapped = reacs_mapped.groupby(["ncbiprotein", "ec-code"]).filter(
lambda x: len(x) < threshold_add_reacs
)
reacs_mapped = reacs_mapped.merge(
unmapped_reacs.explode("ncbiprotein"),
on=["ec-code", "ncbiprotein"],
how="outer",
)
# rename columns and cleanup
reacs_mapped.rename({"mnx_equation": "equation"}, inplace=True, axis=1)
reacs_mapped = (
reacs_mapped.groupby(["ec-code", "id"])
.agg(
{
"ncbiprotein": lambda x: x.tolist(),
"equation": "first",
"reference": "first",
"is_transport": "first",
}
)
.reset_index()
)
reacs_mapped = reacs_mapped.reindex(
columns=[
"ec-code",
"ncbiprotein",
"id",
"equation",
"reference",
"is_transport",
]
)
reacs_mapped["via"] = reacs_mapped["id"].apply(
lambda x: "MetaNetX" if x else None
)
return reacs_mapped
def _map_ec_to_reac_bigg(unmapped_reacs: pd.DataFrame) -> pd.DataFrame:
"""Helper function of :py:func:`~refinegems.classes.gapfill.map_ec_to_reac`
for the mapping using the BiGG database.
Args:
- unmapped_reacs (pd.DataFrame):
The input table (ec-code and ncbiprotein columns)
Returns:
pd.DataFrame:
The extended table
"""
# load BiGG reaction namespace
bigg_reacs = load_a_table_from_database("bigg_reactions", False)
bigg_reacs.dropna(subset="EC Number", inplace=True)
bigg_reacs = bigg_reacs[["id", "reaction_string", "EC Number"]].rename(
{"reaction_string": "equation", "EC Number": "ec-code"},
inplace=False,
axis=1,
)
bigg_reacs["ec-code"] = bigg_reacs["ec-code"].apply(
lambda x: x.split(r",") if isinstance(x, str) else None
)
bigg_reacs = bigg_reacs.explode("ec-code")
# merge with unmapped reactions
bigg_mapping = unmapped_reacs.merge(bigg_reacs, on=["ec-code"], how="left")
bigg_mapping.mask(bigg_mapping.isna(), other=None, inplace=True)
# make conform to format
bigg_mapping["reference"] = None
bigg_mapping["is_transport"] = None
bigg_mapping["via"] = bigg_mapping["id"].apply(lambda x: "BiGG" if x else None)
return bigg_mapping
def _map_ec_to_reac_kegg(unmapped_reacs: pd.DataFrame) -> pd.DataFrame:
"""Helper function of :py:func:`~refinegems.classes.gapfill.map_ec_to_reac`
for the mapping using the KEGG database.
Args:
- unmapped_reacs (pd.DataFrame):
The input table (ec-code and ncbiprotein columns)
Returns:
pd.DataFrame:
The extended table
"""
# get KEGG EC number information
eccodes = unmapped_reacs["ec-code"]
kegg_mapped = pd.DataFrame.from_dict(
list(eccodes.progress_apply(parse_KEGG_ec))
)
kegg_mapped = unmapped_reacs.merge(kegg_mapped, on="ec-code")
kegg_mapped["is_transport"] = None
kegg_mapped["via"] = kegg_mapped["id"].apply(lambda x: "KEGG" if x else None)
kegg_mapped.explode(column="id")
return kegg_mapped
# input table should have format:
# ec-code | ncbiprotein
# one EC number per row, list of ncbiprotein per row allowed
if (
len(table.columns) != 2
or "ec-code" not in table.columns
or "ncbiprotein" not in table.columns
):
raise ValueError("Wrong table format. Cannot map EC to reaction.")
# map to MetaNetX
if use_MNX:
table = _map_ec_to_reac_mnx(table, threshold_add_reacs)
# map to BiGG
if use_BiGG:
if "id" in table.columns:
to_map = table[table["id"].isna()][["ec-code", "ncbiprotein"]]
if len(to_map) > 0:
to_map = _map_ec_to_reac_bigg(to_map)
table = pd.concat([to_map, table[~table["id"].isna()]])
else:
table = _map_ec_to_reac_bigg(table)
# map to KEGG
if use_KEGG:
if "id" in table.columns:
to_map = table[table["id"].isna()][["ec-code", "ncbiprotein"]]
if len(to_map) > 0:
to_map = _map_ec_to_reac_kegg(to_map)
table = pd.concat([to_map, table[~table["id"].isna()]])
else:
table = _map_ec_to_reac_kegg(table)
# explode
table = table.explode("id", ignore_index=True).explode("id", ignore_index=True)
# output: ec-code ncbiprotein id equation reference is_transport via
return table
############################################################################
# classes
############################################################################
# -------------
# abtract class
# -------------
[docs]
class GapFiller(ABC):
"""Abstract base class for the gap filling.
Already includes functions for the "filling" part of the gap-filling approach
and some helper functions. Each subclass needs an implementation of `find_missing_genes`
and `find_missing_reactions` to determine the entities, that are missing in the model.
Attributes:
- full_gene_list (list):
List of all the genes.
- missing_genes:
DataFrame containing all genes found as missing with additional information. <br>
``ncbiprotein | locus_tag | <optional columns>``
- missing_reactions:
DataFrame containing all reactions found as missing with additional information.<br>
``ec-code | ncbiprotein | id | equation | reference | <is_transport> | via | add_to_GPR``
- geneid_type (str):
What type of gene ID the model contains.
Defaults to 'ncbi'.
- _statistics (dict):
Dictionary of statistical information of the gap-filling run. Includes e.g.
the number of added genes and reactions.
- manual_curation (dict):
Dictionary of reaction and gene IDs to be used for manual curation.
"""
[docs]
def __init__(self) -> None:
# data
self.full_gene_list = None
self.missing_genes = (
None # missing genes, that have not yet been sorted into any category
)
self.missing_reactions = (
None # missing reacs, that have not yet been sorted into any category
)
# general information
self.geneid_type = "ncbi"
self._variety = "Undefined" # Specifies the variety of the gapfiller, e.g. 'BioCyc', 'KEGG', 'GFF + SwissProt'
# collect stats & Co, can be extended by subclasses
self._statistics = {
"genes": {
"missing (total)": 0,
"unmappable": 0,
"missing (mappable)": 0,
"duplicated": 0,
"added": 0,
"building failed": 0,
"missing (remaining)": 0,
},
"reactions": {
"missing (based on genes)": 0,
"already in model": 0,
"missing (total)": 0,
"mapped to MNX": 0,
"mapped to BiGG": 0,
"mapped to KEGG": 0,
"unmappable": 0,
"added": 0,
"building failed": 0,
"missing (remaining)": 0,
},
}
self.manual_curation = {"genes": {}, "reactions": {}}
# abstract methods
# ----------------
[docs]
@abstractmethod
def find_missing_genes(self, model):
"""Find missing genes in the model. Parameters can be extended as needed.
Needs to save a table in the format
``ncbiprotein | locus_tag | <optional columns>``
to the attribute missing_genes.
"""
pass
[docs]
@abstractmethod
def find_missing_reactions(self, model):
"""Find missing reactions in the model. Parameters can be extended as needed.
Needs to save a table of the format
``ec-code | ncbiprotein | id | equation | reference | <is_transport> | via | add_to_GPR``
to the attribute missing_reactions.
Method specific information can be added to the reference column, which is
expected to contain a dictionary. The 'via' column describes the way
the database will be added to the model.
"""
pass
# finding the gaps
# ----------------
[docs]
def _find_reac_in_model(
self,
model: cobra.Model,
eccode: str,
id: str,
idtype: Literal["MetaNetX", "KEGG", "BiGG", "BioCyc"],
include_ec_match: bool = False,
) -> Union[None, list]:
"""Helper function of :py:class:`~refinegems.classes.gapfill.GapFiller`.
Search the model for an ID (and optionally EC number), to determine, if the
reaction is in the model.
Args:
- model (cobra.Model):
The model loaded with COBRApy.
- eccode (str):
The EC number in the format: X.X.X.X
- id (str):
The ID to search for.
- idtype (Literal['MetaNetX','KEGG','BiGG', 'BioCyc']):
Name of the database the ID belongs to.
- include_ec_match (bool, optional):
Option to include a match if only the EC number matches.
Defaults to False.
Returns:
(1) Case one or more match found:
list:
List of the ID of the reactions in the model, that match
the query.
(2) Case no match:
None:
Nothing found.
"""
MAPPING = {
"MetaNetX": "metanetx.reaction",
"KEGG": "kegg.reaction",
"BiGG": "bigg.reaction",
"BioCyc": "metacyc.reaction",
}
found = []
for r in model.reactions:
if MAPPING[idtype] in r.annotation.keys():
if (
isinstance(r.annotation[MAPPING[idtype]], list)
and id in r.annotation[MAPPING[idtype]]
):
found.append(r.id)
elif (
isinstance(r.annotation[MAPPING[idtype]], str)
and id == r.annotation[MAPPING[idtype]]
):
found.append(r.id)
if include_ec_match and eccode and "ec-code" in r.annotation.keys():
if (
isinstance(r.annotation["ec-code"], list)
and eccode in r.annotation["ec-code"]
):
found.append(r.id)
elif (
isinstance(r.annotation["ec-code"], str)
and eccode == r.annotation["ec-code"]
):
found.append(r.id)
found = list(set(found))
if len(found) > 0:
return found
return None
# actual "Filling" part
# ---------------------
[docs]
def add_genes_from_table(self, model: libModel, gene_table: pd.DataFrame) -> None:
"""Create new GeneProduct for a table of genes in the format:
| ncbiprotein | locus_tag | UniProt | ... |
The dots symbolise additional columns, that can be passed to the function,
but will not be used by it. The other columns, except UniProt, are required.
Args:
- model (libModel):
The model loaded with libSBML.
- gene_table (pd.DataFrame):
The table with the genes to add. At least needs the columns
*ncbiprotein* and *locus_tag*. Optional columns include
*UniProt* amongst other.
"""
# ncbiprotein | locus_tag | ...
# work on a copy to ensure input stays the same
gene_table = gene_table.copy()
# gene_table.drop(columns=['ec-code'],inplace=True)
# create gps from the table and add them to the model
cols_for_refs = [
_ for _ in REF_COL_GF_GENE_MAP.keys() if _ in gene_table.columns
]
# create gp
for idx, x in tqdm(
gene_table.iterrows(), total=len(gene_table), desc="Adding genes to model"
):
# get additional references
references = dict()
for dbname in cols_for_refs:
references[dbname] = (x[dbname], True)
create_gp(
model, x["ncbiprotein"], locus_tag=x["locus_tag"], reference=references
)
self._statistics["genes"]["added"] += 1
[docs]
def add_gene_reac_associations_from_table(
self, model: libModel, reac_table: pd.DataFrame
) -> None:
"""Using a table with at least the columns 'ncbiprotein'
(containing e.g. NCBI protein identifier (lists), should be gene IDs in the model)
and 'add_to_GPR' (containing reactions identifier (lists)), add the gene IDs to the
GPRs of the corresponding reactions.
Args:
- model (libModel):
The model loaded with libSBML.
- reac_table (pd.DataFrame):
The table containing at least the columns 'ncbiprotein' (gene IDs) and
'add_to_GPR' (reaction IDs)
"""
model_gene_ids = [_.getId() for _ in model.getPlugin(0).getListOfGeneProducts()]
# get each unique ncbiprotein vs reaction mapping
reac_table = reac_table[["ncbiprotein", "add_to_GPR"]]
reac_table = reac_table.explode("ncbiprotein")
reac_table.drop_duplicates(subset="ncbiprotein", inplace=True)
# add the genes to the corresponding GPRs
for idx, row in reac_table.iterrows():
# check, if G_+ncbiprotein in model
# if yes, add gpr
geneid = _f_gene_rev(row["ncbiprotein"])
for reacid in row["add_to_GPR"]:
current_reacid = "R_" + reacid
current_mgids = [mgid for mgid in model_gene_ids if geneid in mgid]
if current_mgids:
if len(current_mgids) == 1:
create_gpr(model.getReaction(current_reacid), current_mgids[0])
else:
mes = f"Found multiple matches for {geneid} in model: {current_mgids}. Belongs to reaction {current_reacid}."
warnings.warn(mes, UserWarning)
# else, print warning
else:
mes = f"Cannot find {geneid} in model. Should be added to {current_reacid}."
warnings.warn(mes, UserWarning)
[docs]
def add_reactions_from_table(
self,
model: cobra.Model,
missing_reac_table: pd.DataFrame,
formula_check: Literal["none", "existence", "wildcard", "strict"] = "existence",
exclude_dna: bool = True,
exclude_rna: bool = True,
idprefix: str = "refineGEMs",
namespace: Literal["BiGG"] = "BiGG",
) -> pd.DataFrame:
"""Helper function to add reactions to a model from the missing_reactions table
(output of the chosen implementation of :py:meth:`~refinegems.classes.gapfill.GapFiller.find_missing_reactions`)
Args:
- model (cobra.Model):
The model, loaded with COBRpy.
- missing_reac_table (pd.DataFrame):
The missing reactions table.
- formula_check (Literal['none','existence','wildcard','strict'], optional):
Param for checking metabolite formula before adding them to the model.
For more information, refer to :py:func:`~refinegems.utility.entities.isreaction_complete`
Defaults to 'existence'.
- exclude_dna (bool, optional):
Option to exclude reaction containing 'DNA' from being added to the model.
Defaults to True.
- exclude_rna (bool, optional):
Option to exclude reaction containing 'RNA' from being added to the model.
Defaults to True.
- idprefix (str, optional):
A prefix to use, if pseudo-IDs need to be created.
Defaults to 'refineGEMs'.
- namespace (Literal['BiGG'], optional):
Namespace to use for the reactions and metabolites
(and the model). Defaults to 'BiGG'.
Raises:
- TypeError: Unknown return type for reac param. Please contact the developers.
Returns:
pd.DataFrame:
Table containing the information about which genes can now be added
to reactions (use for GPR curation).
"""
# reconstruct reactions
# ---------------------
for idx, row in tqdm(
missing_reac_table.iterrows(),
desc="Trying to add missing reacs",
total=missing_reac_table.shape[0],
):
# add EC number to references
if row["reference"]:
refs = row["reference"]
if isinstance(refs, dict):
continue
elif refs[0] == "{":
refs = refs.replace(r"'", r"\"")
refs = json.loads(refs)
else:
refs = refs.split(r":")
refs = {refs[0]: refs[1]}
else:
refs = {}
if row["ec-code"]:
if "ec-code" in refs.keys():
if not isinstance(refs["ec-code"], list):
refs["ec-code"] = [refs["ec-code"]]
if isinstance(row["ec-code"], list):
refs["ec-code"] = list(set(refs["ec-code"] + row["ec-code"]))
elif isinstance(row["ec-code"], str):
refs["ec-code"] = list(
set(refs["ec-code"].append(row["ec-code"]))
)
else:
warnings.warn(
f'Unknown type for ec-code: {type(row["ec-code"])}',
UserWarning,
)
else:
refs["ec-code"] = (
row["ec-code"]
if isinstance(row["ec-code"], list)
else [row["ec-code"]]
)
# build reaction
reac = None
match row["via"]:
# MetaNetX
case "MetaNetX":
reac = build_reaction_mnx(
model,
row["id"],
reac_str=str(row["equation"]),
references=refs,
idprefix=idprefix,
namespace=namespace,
)
# KEGG
case "KEGG":
reac = build_reaction_kegg(
model,
row["id"],
reac_str=str(row["equation"]),
references=refs,
idprefix=idprefix,
namespace=namespace,
)
# BiGG
case "BiGG":
reac = build_reaction_bigg(
model,
row["id"],
references=refs,
idprefix=idprefix,
namespace=namespace,
)
# Unknown database
case _:
mes = f"""Unknown database name for reaction reconstruction: {row["via"]}\n
Reaction will not be reconstructed."""
warnings.warn(mes, UserWarning)
# check output of reconstruction
# ------------------------------
# case 1: reconstruction was not possible
if not reac:
self._statistics["reactions"]["building failed"] += 1
pass # nothing to do here
# case 2: reaction(s) found in model
elif isinstance(reac, list):
# add found names to the add_to_GPR column of the table
current_gpr = missing_reac_table.loc[idx, "add_to_GPR"]
if str(current_gpr) == "nan":
current_gpr = None
if not current_gpr:
missing_reac_table.at[idx, "add_to_GPR"] = reac
else:
missing_reac_table.at[idx, "add_to_GPR"] = list(
set(reac + current_gpr)
)
# case 3: new reaction was generated
elif isinstance(reac, cobra.Reaction):
# validate reaction
if isreaction_complete(
reac,
formula_check=formula_check,
exclude_dna=exclude_dna,
exclude_rna=exclude_rna,
):
# Extend reaction notes with information about the GapFiller
reac.notes["found with"] = f"refineGEMs GapFiller, {self._variety}"
# add reaction to model (if validation successful)
model.add_reactions([reac])
self._statistics["reactions"]["added"] += 1
# add reaction ID to table under add_to_GPR
current_gpr = missing_reac_table.loc[idx, "add_to_GPR"]
if pd.isna(current_gpr):
missing_reac_table.at[idx, "add_to_GPR"] = [reac.id]
else:
current_gpr.append(reac.id)
missing_reac_table.at[idx, "add_to_GPR"] = list(
set(current_gpr)
)
# case 4: should never occur
else:
mes = f"Unknown return type for reac param. Please contact the developers."
raise TypeError(mes)
# save reactions, that could not be recontructed, for manual curation
manual_curation_reacs = missing_reac_table[
missing_reac_table["add_to_GPR"].isnull()
]
self.manual_curation["reactions"]["building failed"] = manual_curation_reacs
self._statistics["reactions"]["building failed"] = manual_curation_reacs[
"id"
].nunique()
# return the updated table with successfully reconstructed reaction ids
# to enable adding the genes
missing_gprs = missing_reac_table[~missing_reac_table["add_to_GPR"].isnull()]
return missing_gprs
[docs]
def fill_model(self, model: Union[cobra.Model, libModel], **kwargs) -> libModel:
"""Based on a table of missing genes and missing reactions,
fill the gaps in a model as good as possible automatically.
.. note::
This model rewrites and reloads the input model. Only the returned model
has all the edits.
Args:
- model (Union[cobra.Model,libModel]):
The model, either a libSBML or COBRApy model entity.
- kwargs:
Additional parameters to be passed to
:py:meth:`~refinegems.classes.gapfill.GapFiller.add_reactions_from_table`.
Raises:
- TypeError: Unknown type of model.
Returns:
libModel:
The gap-filled model.
"""
# Step 0: Preparations
# --------------------
# load the correct type of model for the first step
match model:
case cobra.Model():
with NamedTemporaryFile(suffix=".xml", delete=False) as tmp:
write_model_to_file(model, tmp.name)
model = load_model(tmp.name, "libsbml")
os.remove(tmp.name)
case libModel():
pass
case _:
mes = f"Unknown type of model: {type(model)}"
raise TypeError(mes)
# Check if missing reactions found, if not return model
if not isinstance(self.missing_reactions, pd.DataFrame) or self.missing_reactions.empty:
return model
# Filter out reactions without ncbiprotein
self.manual_curation["reactions"]["no GPR"] = self.missing_reactions[
self.missing_reactions["ncbiprotein"].isnull()
]
self._statistics["reactions"]["unmappable"] += self.manual_curation[
"reactions"
]["no GPR"]["id"].nunique()
self.missing_reactions = self.missing_reactions[
~self.missing_reactions["ncbiprotein"].isnull()
]
# Filter out genes without ncbiprotein
self.manual_curation["genes"]["no ncbiprotein"] = self.missing_genes[
self.missing_genes["ncbiprotein"].isnull()
]
num_genes_no_ncbi_id = self.manual_curation["genes"]["no ncbiprotein"][
"locus_tag"
].nunique()
self._statistics["genes"]["unmappable"] += num_genes_no_ncbi_id
self._statistics["genes"]["missing (mappable)"] -= num_genes_no_ncbi_id
self.missing_genes = self.missing_genes[
~self.missing_genes["ncbiprotein"].isnull()
]
# filter out duplicated genes to avoid duplicated IDs in the model
if len(self.missing_genes) != len(self.missing_genes["ncbiprotein"].unique()):
self.manual_curation["genes"]["duplicated (not added)"] = (
self.missing_genes[
self.missing_genes.duplicated(subset=["ncbiprotein"])
]
)
self._statistics["genes"]["duplicated"] = self.manual_curation["genes"][
"duplicated (not added)"
]["locus_tag"].nunique()
self.missing_genes = self.missing_genes[
~self.missing_genes.duplicated(subset=["ncbiprotein"])
]
# Step 1: Add genes to model whose reactions are already in it
# -------------------------------------------------------------
# filter the respective genes and reactions
reacs_in_model = self.missing_reactions[
~(self.missing_reactions["add_to_GPR"].isnull())
]
ncbiprot_with_reacs_in_model = [*chain(*list(reacs_in_model["ncbiprotein"]))]
genes_with_reacs_in_model = self.missing_genes[
self.missing_genes["ncbiprotein"].isin(ncbiprot_with_reacs_in_model)
]
if len(genes_with_reacs_in_model) > 0:
# add genes as gene products to model
self.add_genes_from_table(model, genes_with_reacs_in_model)
# extend gene production rules
self.add_gene_reac_associations_from_table(model, reacs_in_model)
# what remains:
self.missing_reactions = self.missing_reactions[
self.missing_reactions["add_to_GPR"].isnull()
]
self.missing_genes = self.missing_genes[
~(self.missing_genes["ncbiprotein"].isin(ncbiprot_with_reacs_in_model))
] #
# Step 2: Add reactions to model, if reconstruction successful
# ------------------------------------------------------------
if len(self.missing_reactions) > 0:
# re-load model with cobrapy
with NamedTemporaryFile(suffix=".xml", delete=False) as tmp:
write_model_to_file(model, tmp.name)
model = load_model(tmp.name, "cobra")
os.remove(tmp.name)
# .......................
# @DEBUG
# if len(self.missing_reactions) > 10:
# self.missing_reactions = self.missing_reactions.sample(20)
# print('fill_model: Running in debugging mode')
# .......................
# add reactions to model
missing_gprs = self.add_reactions_from_table(
model, self.missing_reactions, **kwargs
)
# Step 3: Add GPRs + genes for the newly curated reactions
# --------------------------------------------------------
# re-load model with libsbml
with NamedTemporaryFile(suffix=".xml", delete=False) as tmp:
write_model_to_file(model, tmp.name)
model = load_model(tmp.name, "libsbml")
os.remove(tmp.name)
try:
if len(missing_gprs) > 0:
# filter for genes for GPRs but not yet in model
ncbiprot_with_reacs_in_model = [
*chain(*list(missing_gprs["ncbiprotein"]))
]
genes_with_reacs_in_model = self.missing_genes[
self.missing_genes["ncbiprotein"].isin(ncbiprot_with_reacs_in_model)
]
if len(genes_with_reacs_in_model) > 0:
# add genes as gene products to model
self.add_genes_from_table(model, genes_with_reacs_in_model)
# extend gene production rules
reacs_in_model = self.missing_reactions[
~(self.missing_reactions["add_to_GPR"].isnull())
]
self.add_gene_reac_associations_from_table(model, reacs_in_model)
self.missing_genes = self.missing_genes[
~(
self.missing_genes["ncbiprotein"].isin(
ncbiprot_with_reacs_in_model
)
)
]
except NameError:
warnings.warn("Something went wrong. Contact the developers (gapfillmodel)")
# collect stats and stuff for manual curation
self.manual_curation["genes"]["building failed"] = self.missing_genes
self._statistics["genes"]["building failed"] = self.missing_genes[
"locus_tag"
].nunique()
# calculate remaining statistics
self._statistics["reactions"]["missing (remaining)"] += self._statistics[
"reactions"
]["building failed"]
self._statistics["genes"]["missing (remaining)"] += self._statistics["genes"][
"building failed"
]
return model
[docs]
def report(
self, dir: str, hide_zeros: bool = False, no_title: bool = False
) -> None:
"""Based on the previous gap-filling, save statistics and missing genes/reactions for manual curation.
Args:
- dir (str):
Path to a directory to save the report to.
- hide_zeros (bool, optional):
Option to hide statistics with zero counts. Defaults to False.
- with_title (bool, optional):
Option to get figure without title. Defaults to False.
"""
logging.warning(
"Please keep in mind that all statistical values are determined while running the tool "
+ "and only unique values are tracked."
)
statistics_report = GapFillerReport(
self._variety,
deepcopy(self._statistics),
self.manual_curation,
hide_zeros,
no_title,
)
statistics_report.save(Path(dir))
# --------------------
# Gapfilling with KEGG
# --------------------
[docs]
class KEGGapFiller(GapFiller):
"""Based on a KEGG organism ID (corresponding to the organism of the model),
find missing genes in the model and map them to reactions to try and fill the gaps
found with the KEGG database.
.. note::
Please keep in mind that using this module requires a model containing the Genbank locus tags as labels.
If your model does not conform to this you can use one of the functions
:py:func:`~refinegems.curation.curate.polish_model` or
:py:func:`~refinegems.curation.curate.extend_gp_annots_via_mapping_table`.
.. hint::
Due to the KEGG REST API this is relatively slow.
Attributes:
- GapFiller Attributes:
All attributes of the parent class :py:class:`~refinegems.classes.gapfill.GapFiller`
- organismid (str, required):
Abbreviation of the organism in the KEGG database.
"""
[docs]
def __init__(self, organismid) -> None:
super().__init__()
self.organismid = organismid
self._variety = "KEGG"
[docs]
def find_missing_genes(self, model: libModel):
"""Get the missing genes in model in comparison to the KEGG entry of the
organism. Saves a table containing the missing genes
to the attribute missing_genes.
Format:
``orgid:locus | locus_tag | kegg.orthology | ec-code | ncbiprotein | uniprot``
Args:
- model (libModel):
The model loaded with libSBML.
"""
# Function originally from refineGEMs.genecomp/refineGEMs.KEGG_analysis/entities --- Modified
def get_model_genes(model: libModel) -> pd.DataFrame:
"""Extracts KEGG Genes from given model
Args:
- model (model-libsbml):
Model loaded with libSBML
Returns:
pd.DataFrame:
Table with all KEGG Genes in the model
"""
genes_in_model = []
for gene in model.getPlugin(0).getListOfGeneProducts():
cv_terms = gene.getCVTerms()
if cv_terms:
for cv_term in cv_terms:
for idx in range(cv_term.getNumResources()):
uri = cv_term.getResourceURI(idx)
if "kegg.genes" in uri:
genes_in_model.append(
re.split(r"kegg.genes:|kegg.genes/", uri)[1]
) # work with old/new pattern
return pd.DataFrame(genes_in_model, columns=["orgid:locus"])
# Step 1: get genes from model
# ----------------------------
genes_in_model = get_model_genes(model)
# Step 2: get genes of organism from KEGG
# ---------------------------------------
gene_KEGG_list = KEGG().list(self.organismid)
gene_KEGG_table = pd.read_table(io.StringIO(gene_KEGG_list), header=None)
gene_KEGG_table.columns = ["orgid:locus", "CDS", "position", "protein"]
self.full_gene_list = gene_KEGG_table
gene_KEGG_table = gene_KEGG_table[["orgid:locus"]]
# Statistics on full gene list based on KEGG
self._statistics["genes"][f"total (based on {self._variety})"] = (
self.full_gene_list["orgid:locus"].nunique()
+ int(self.full_gene_list["orgid:locus"].isna().sum())
)
# Step 3: KEGG vs. model genes -> get missing genes for model
# ----------------------------
genes_not_in_model = gene_KEGG_table[
~gene_KEGG_table["orgid:locus"].isin(genes_in_model["orgid:locus"])
]
# Step 4: extract locus tag
# -------------------------
genes_not_in_model["locus_tag"] = (
genes_not_in_model["orgid:locus"].str.split(r":").str[1]
)
# Step 5: map to EC via KEGG
# --------------------------
# @DEBUG .......................
# genes_not_in_model = genes_not_in_model.iloc[0:50,:]
# print(UserWarning('Running in debugging mode.'))
# ..............................
geneKEGG_mapping = pd.DataFrame.from_dict(
list(genes_not_in_model["orgid:locus"].progress_apply(parse_KEGG_gene))
)
genes_not_in_model = genes_not_in_model.merge(
geneKEGG_mapping, how="left", on="orgid:locus"
)
genes_not_in_model = genes_not_in_model.explode("ncbiprotein")
# collect stats
self._statistics["genes"]["missing (total)"] = genes_not_in_model[
"locus_tag"
].nunique()
self.missing_genes = genes_not_in_model
[docs]
def find_missing_reactions(self, model: cobra.Model, threshold_add_reacs: int = 5):
# Step 1: filter missing gene list + extract ECs
# ----------------------------------------------
# Filter out unmappable genes that have whether NCBI protein IDs nor EC numbers
mask = (
self.missing_genes["ec-code"].isnull()
& self.missing_genes["ncbiprotein"].isnull()
)
self.manual_curation["genes"]["no ncbiprotein, no EC"] = self.missing_genes[
mask
]
self.missing_genes = self.missing_genes[~mask]
self._statistics["genes"]["unmappable"] = self.manual_curation["genes"][
"no ncbiprotein, no EC"
]["locus_tag"].nunique()
# Filter out remaining unmappable genes due to missing EC numbers
mask = self.missing_genes["ec-code"].isnull()
self.manual_curation["genes"]["no EC"] = self.missing_genes[mask]
self._statistics["genes"]["unmappable"] += self.manual_curation["genes"][
"no EC"
]["locus_tag"].nunique()
self.missing_genes = self.missing_genes[~mask]
# collect overall remaining statistics on genes
self._statistics["genes"]["missing (mappable)"] = self.missing_genes[
"locus_tag"
].nunique()
self._statistics["genes"]["missing (remaining)"] = self._statistics["genes"][
"unmappable"
]
# get relevant infos for reacs
self.missing_reactions = self.missing_genes[["ec-code", "ncbiprotein"]]
# check, if any automatic gapfilling is possible
if self.missing_reactions.empty:
logging.warning(
f"No missing reactions for the provided model {model.id} were found via {self._variety}."
)
return None
# transform table into EC-number vs. list of NCBI protein IDs
eccode = (
self.missing_reactions["ec-code"]
.apply(pd.Series)
.reset_index()
.melt(id_vars="index")
.dropna()[["index", "value"]]
.set_index("index")
)
ncbiprot = (
self.missing_reactions["ncbiprotein"]
.apply(pd.Series)
.reset_index()
.melt(id_vars="index")
.dropna()[["index", "value"]]
.set_index("index")
)
self.missing_reactions = pd.merge(
eccode, ncbiprot, left_index=True, right_index=True
).rename(columns={"value_x": "ec-code", "value_y": "ncbiprotein"})
self.missing_reactions = (
self.missing_reactions.groupby(self.missing_reactions["ec-code"])
.aggregate({"ncbiprotein": "unique"})
.reset_index()
)
# @DEBUG .......................
# self.missing_reactions = self.missing_reactions.iloc[10:30,:]
# print(UserWarning('Running in debugging mode.'))
# ..............................
# Step 2: map EC to reaction(s) if possible
# -----------------------------------------
# via MNX, BiGG, KEGG
reacs_mapped = map_ec_to_reac(self.missing_reactions, threshold_add_reacs)
# Statistics on missing based on genes
self._statistics["reactions"]["missing (based on genes)"] = reacs_mapped[
"id"
].nunique() + int(reacs_mapped["id"].isna().sum())
# Step 3: clean and map to model reactions
# ----------------------------------------
# need manual curation
self.manual_curation["reactions"]["no ID"] = reacs_mapped[
reacs_mapped["id"].isnull()
]
self._statistics["reactions"]["unmappable"] = int(
self.manual_curation["reactions"]["no ID"]["id"].isna().sum()
)
# map to model reactions
reacs_mapped = reacs_mapped[~reacs_mapped["id"].isnull()]
gpr = reacs_mapped.apply(
lambda x: self._find_reac_in_model(model, x["ec-code"], x["id"], x["via"]),
axis=1,
)
if type(gpr) == object:
reacs_mapped["add_to_GPR"] = gpr
else:
reacs_mapped["add_to_GPR"] = None
# statistics on reactions already in model
self._statistics["reactions"]["already in model"] = reacs_mapped[
~reacs_mapped["add_to_GPR"].isnull()
]["id"].nunique()
# statistics on mapped reactions
self._statistics["reactions"]["mapped to MNX"] = reacs_mapped[
reacs_mapped["add_to_GPR"].isnull()
][reacs_mapped["via"] == "MetaNetX"]["id"].nunique()
self._statistics["reactions"]["mapped to BiGG"] = reacs_mapped[
reacs_mapped["add_to_GPR"].isnull()
][reacs_mapped["via"] == "BiGG"]["id"].nunique()
self._statistics["reactions"]["mapped to KEGG"] = reacs_mapped[
reacs_mapped["add_to_GPR"].isnull()
][reacs_mapped["via"] == "KEGG"]["id"].nunique()
# calculate remaining statistics
self._statistics["reactions"]["missing (total)"] = (
self._statistics["reactions"]["missing (based on genes)"]
- self._statistics["reactions"]["already in model"]
)
self._statistics["reactions"]["missing (remaining)"] = self._statistics[
"reactions"
]["unmappable"]
# return missing_reactions
self.missing_reactions = reacs_mapped
# ----------------------
# Gapfilling with BioCyc
# ----------------------
[docs]
class BioCycGapFiller(GapFiller):
"""
| Based on a SmartTable with information on the genes and a SmartTable with
| information on the reactions of the organism of the model, this class
| finds missing genes in the model and maps them to reactions to try and
| fill the gaps found with the BioCyc gene SmartTable.
|
| For specifications on the SmartTables see the attributes `biocyc_gene_tbl`
| & `biocyc_reacs_tbl`
.. note::
Please keep in mind that using this module requires a model containing the Genbank locus tags as labels.
If your model does not conform to this you can use one of the functions
:py:func:`~refinegems.curation.curate.polish_model` or
:py:func:`~refinegems.curation.curate.extend_gp_annots_via_mapping_table`.
Attributes:
- GapFiller Attributes:
All attributes of the parent class :py:class:`~refinegems.classes.gapfill.GapFiller`
- biocyc_gene_tbl_path (str, required):
Path to organism-specific SmartTable for genes from BioCyc;
Should contain the columns: ``Accession-2 | Reactions of gene``
- biocyc_reacs_tbl_path (str, required):
Path to organism-specific SmartTable for reactions from BioCyc;
Should contain the columns:
``Reaction | Object ID | EC-Number | Spontaneous?``
- gff (str, required):
Path to organism-specific GFF file
"""
[docs]
def __init__(
self, biocyc_gene_tbl_path: str, biocyc_reacs_tbl_path: str, gff: str
) -> None:
super().__init__()
self.full_gene_list = biocyc_gene_tbl_path
self.biocyc_rxn_tbl = biocyc_reacs_tbl_path
self._gff = gff
self._variety = "BioCyc"
@property
def full_gene_list(self):
"""
| Get or set the current BioCyc Gene table.
| While setting the provided path for a TSV file from BioCyc with the
| columns ``'Accession-2' | 'Reactions of gene'`` is parsed and the
| content of the file is stored in a DataFrame containing all rows where
| a 'Reactions of gene' exists.
.. hint::
Please keep in mind that the column Accession-2 needs to contain Genbank locus tags. If that is not the case
for your organism use the correct column from BioCyc and rename it accordingly.
"""
return self._full_gene_list
@full_gene_list.setter
def full_gene_list(self, biocyc_gene_tbl_path: str):
# Read table
biocyc_genes = pd.read_table(
biocyc_gene_tbl_path,
usecols=["Accession-2", "Reactions of gene"],
dtype=str,
)
# Rename columns for further use
biocyc_genes.rename(
columns={"Accession-2": "locus_tag", "Reactions of gene": "id"},
inplace=True,
)
# Turn empty strings into None
biocyc_genes.replace(r"", None, inplace=True)
# Drop only complete empty rows
biocyc_genes.dropna(how="all", inplace=True)
# Statistics on full gene list based on BioCyc
self._statistics["genes"][f"total (based on {self._variety})"] = biocyc_genes[
"locus_tag"
].nunique() + int(self.missing_genes["locus_tag"].isna().sum())
self.full_gene_list = biocyc_genes
@property
def biocyc_rxn_tbl(self):
"""
| Get or set the current BioCyc Reaction table.
| While setting the provided path for a TSV file from BioCyc with the
| columns ``'Reaction' | 'Object ID' | 'EC-Number' | 'Spontaneous?'`` is
| parsed and the content of the file is stored in a DataFrame.
"""
return self._biocyc_rxn_tbl
@biocyc_rxn_tbl.setter
def biocyc_rxn_tbl(self, biocyc_reacs_tbl_path: str) -> pd.DataFrame:
# Read table
biocyc_rxns = pd.read_table(
biocyc_reacs_tbl_path,
usecols=["Reaction", "Object ID", "EC-Number", "Spontaneous?"],
dtype=str,
)
# Rename columns for further use
biocyc_rxns.rename(
columns={
"Reaction": "equation",
"Object ID": "id",
"EC-Number": "ec-code",
"Spontaneous?": "is_spontaneous",
},
inplace=True,
)
# Turn empty strings into NaNs
biocyc_rxns.replace(r"", None, inplace=True)
# Set entries in is_spontaneous to booleans &
# specify empty entries in 'is_spontaneous' as False
biocyc_rxns["is_spontaneous"].replace({r"T": True, r"F": False}, inplace=True)
biocyc_rxns["is_spontaneous"] = biocyc_rxns["is_spontaneous"].fillna(False)
self._biocyc_rxn_tbl = biocyc_rxns
[docs]
def find_missing_genes(self, model: libModel):
"""Retrieves the missing genes and reactions from the BioCyc table
according to the 'Accession-2' identifiers (locus_tags)
Args:
- model (libModel):
Model loaded with libSBML
"""
# Step 1: get genes from model
# ----------------------------
geneps_in_model = [
_.getLabel() for _ in model.getPlugin(0).getListOfGeneProducts()
]
# Step 2: Get genes of organism from BioCyc
# -----------------------------------------
# See self._biocyc_gene_tbl
# Step 3: BioCyc vs. model genes -> get missing genes for model
# -------------------------------------------------------------
self.missing_genes = self.biocyc_gene_tbl[
~self.biocyc_gene_tbl["locus_tag"].isin(geneps_in_model)
]
# Step 4: Get amount of missing genes in total
# --------------------------------------------
self._statistics["genes"]["missing (total)"] = self.missing_genes[
"locus_tag"
].nunique()
# Step 5: Filter results
# ----------------------
# Save not mappable genes due to no reaction ID
self.manual_curation["genes"]["no reaction ID"] = self.missing_genes[
self.missing_genes["id"].isnull()
]
# Add amount of unmappable genes to statistics
self._statistics["genes"]["unmappable"] = self.manual_curation["genes"][
"no reaction ID"
]["locus_tag"].nunique()
# Remove all rows where 'id' is None
self.missing_genes.dropna(subset="id", inplace=True)
# Step 6: Get ncbiprotein IDs
# ---------------------------
# Parse GFF file to obtain locus_tag2ncbiportein mapping for all CDS
locus_tag2ncbiprotein_df = parse_gff_for_cds(
self._gff,
{"locus_tag": "locus_tag", "protein_id": "ncbiprotein", "product": "name"},
)
locus_tag2ncbiprotein_df = locus_tag2ncbiprotein_df.explode("ncbiprotein")
locus_tag2ncbiprotein_df = locus_tag2ncbiprotein_df.explode("name")
# Get the complete missing genes dataframe with the ncbiprotein IDs
self.missing_genes = self.missing_genes.merge(
locus_tag2ncbiprotein_df, on="locus_tag"
)
# Step 7: Get amount of missing genes in total
# --------------------------------------------
self._statistics["genes"]["missing (mappable)"] = self.missing_genes[
"locus_tag"
].nunique()
self._statistics["genes"]["missing (remaining)"] = self._statistics["genes"][
"unmappable"
]
[docs]
def find_missing_reactions(self, model: cobra.Model):
"""Retrieves the missing reactions with more information like the
equation, EC code, etc. according to the missing genes
Args:
- model (cobra.Model):
Model loaded with COBRApy
"""
# Step 1: filter missing gene list + extract ECs
# ----------------------------------------------
# Drop locus tag column as not needed for here
missing_genes = self.missing_genes.drop(["locus_tag", "name"], axis=1)
# Expand missing genes result table to merge with Biocyc reactions table
missing_genes = pd.DataFrame(
missing_genes["id"].str.split(r"//").tolist(),
index=missing_genes["ncbiprotein"],
).stack()
missing_genes = missing_genes.reset_index([0, "ncbiprotein"])
missing_genes.columns = ["ncbiprotein", "id"]
missing_genes["id"] = missing_genes["id"].str.strip()
# Turn ncbiprotein column into lists of ncbiprotein IDs per reaction
ncbiprotein_as_list = (
missing_genes.groupby("id")["ncbiprotein"]
.apply(list)
.reset_index(name="ncbiprotein")
)
missing_genes.drop("ncbiprotein", axis=1, inplace=True)
missing_genes = ncbiprotein_as_list.merge(missing_genes, on="id")
missing_genes["ncbiprotein"] = missing_genes["ncbiprotein"].apply(
lambda x: x if not None in x else list(filter(None, x))
)
# Drop duplicates to get the unique BioCyc IDs
missing_genes.drop_duplicates(subset="id", inplace=True)
# Get missing reactions from missing genes
self.missing_reactions = missing_genes.merge(self.biocyc_rxn_tbl, on="id")
# Turn ec-code entries with '//' into lists, remove prefix 'EC-' & get unique ec-codes
self.missing_reactions["ec-code"] = (
self.missing_reactions["ec-code"]
.str.replace(r"EC-", r"")
.str.split(r"\s*//\s*")
)
self.missing_reactions["ec-code"] = (
self.missing_reactions["ec-code"]
.fillna({i: [] for i in self.missing_reactions.index})
.map(set)
.map(list)
)
# Add 'G_spontaneous' as gene product if marked as spontaneous &
# drop is_spontaneous column
self.missing_reactions["ncbiprotein"] = self.missing_reactions.apply(
lambda x: (
x["ncbiprotein"]
if not x["is_spontaneous"]
else x["ncbiprotein"].append("spontaneous")
),
axis=1,
)
self.missing_reactions.drop("is_spontaneous", axis=1, inplace=True)
# Step 2: Get amount of missing reactions from BioCyc for statistics
# ------------------------------------------------------------------
self._statistics["reactions"]["missing (based on genes)"] = (
self.missing_reactions["id"].nunique()
)
# Step 3: Map BioCyc to model reactions & cleanup
# -----------------------------------------------
# Add column 'via'
self.missing_reactions["via"] = "BioCyc"
# Filter reacs for already in model
self.missing_reactions["add_to_GPR"] = self.missing_reactions.apply(
lambda x: self._find_reac_in_model(model, x["ec-code"], x["id"], x["via"]),
axis=1,
)
# Add column 'reference'
self.missing_reactions["reference"] = None
# Drop rows if 'id' is NaN
self.manual_curation["reactions"]["no ID"] = self.missing_reactions[
self.missing_reactions["id"].isnull()
]
self._statistics["reactions"]["unmappable"] = int(
self.manual_curation["reactions"]["no ID"]["id"].isna().sum()
)
self.missing_reactions = self.missing_reactions[
~self.missing_reactions["id"].isnull()
]
# check, if any automatic gapfilling is possible
if self.missing_reactions.empty:
logging.warning(
f"No missing reactions for the provided model {model.id} were found via {self._variety}."
)
return None
# Step 4: Map missing reactions without entries in column 'add_to_GPR'
# to other databases to get a parsable reaction equation
# --------------------------------------------------------------------
# Map to MetaNetX/BiGG
mapped_reacs = map_biocyc_to_reac(self.missing_reactions)
# Filter reacs for already in model
mapped_reacs["add_to_GPR"] = mapped_reacs.apply(
lambda x: self._find_reac_in_model(model, x["ec-code"], x["id"], x["via"]),
axis=1,
)
# Step 5: Get results
# -------------------
# statistics on reactions already in model
self._statistics["reactions"]["already in model"] = mapped_reacs[
~mapped_reacs["add_to_GPR"].isnull()
]["id"].nunique()
# statistics on mapped reactions
self._statistics["reactions"]["mapped to MNX"] = mapped_reacs[
mapped_reacs["via"] == "MetaNetX"
]["id"].nunique()
self._statistics["reactions"]["mapped to BiGG"] = mapped_reacs[
mapped_reacs["via"] == "BiGG"
]["id"].nunique()
self._statistics["reactions"]["mapped to KEGG"] = mapped_reacs[
mapped_reacs["via"] == "KEGG"
]["id"].nunique()
# Split missing reactios based on entries in 'via' & 'add_to_GPR'
mask = (mapped_reacs["via"] == "BioCyc") & (mapped_reacs["add_to_GPR"].isnull())
# DataFrame with unmappable BioCyc IDs & No entries in 'add_to_GPR'
self.manual_curation["reactions"]["no mapping"] = mapped_reacs[mask]
self._statistics["reactions"]["unmappable"] += self.manual_curation[
"reactions"
]["no mapping"]["id"].nunique()
# calculate remaining statistics
self._statistics["reactions"]["missing (total)"] = (
self._statistics["reactions"]["missing (based on genes)"]
- self._statistics["reactions"]["already in model"]
)
self._statistics["reactions"]["missing (remaining)"] = self._statistics[
"reactions"
]["unmappable"]
# Mapped reactions
self.missing_reactions = mapped_reacs[~mask]
# -------------------------------------
# GapFilling with GFF and DMND database
# -------------------------------------
[docs]
class GeneGapFiller(GapFiller):
"""Find gaps in the model using the GFF file of the underlying genome
and a DMND database and optionally NCBI.
This gap filling approach tries to identify missing genes from the GFF file
and uses DIAMOND to run a blastp search for homologs against the DMND database
.. note::
Please keep in mind that using this module requires a model containing the Genbank locus tags as labels.
If your model does not conform to this you can use one of the functions
:py:func:`~refinegems.curation.curate.polish_model` or
:py:func:`~refinegems.curation.curate.extend_gp_annots_via_mapping_table`.
.. hint::
Files required for the swissprot approach can be downloaded with
:py:func:`~refinegems.utility.set_up.download_url`
Attributes:
- GapFiller Attributes:
All attributes of the parent class :py:class:`~refinegems.classes.gapfill.GapFiller`
"""
GFF_COLS = {
"locus_tag": "locus_tag",
"eC_number": "ec-code",
"protein_id": "ncbiprotein",
} # :meta:
[docs]
def __init__(self) -> None:
super().__init__()
self._variety = "GFF"
[docs]
def find_missing_genes(self, gffpath: Union[str, Path], model: libModel):
"""Find missing genes by comparing the CDS regions written in the GFF
with the GeneProduct entities in the model.
Args:
- gffpath (Union[str, Path]):
Path to a GFF file (corresponding to the model).
- model (libModel):
The model loaded with libSBML.
"""
# get all CDS from gff
self.full_gene_list = parse_gff_for_cds(gffpath, self.GFF_COLS)
# Statistics on full gene list based on GFF
self._statistics["genes"][f"total (based on {self._variety})"] = (
self.full_gene_list["locus_tag"].nunique()
+ int(self.full_gene_list["locus_tag"].isna().sum())
)
# get all genes from model by locus tag
model_locustags = [
_f_gene(g.getLabel()) for g in model.getPlugin(0).getListOfGeneProducts()
]
# filter
self.missing_genes = self.full_gene_list.loc[
~self.full_gene_list["locus_tag"].isin(model_locustags)
]
# formatting
for col in self.GFF_COLS.values():
if col not in self.missing_genes.columns:
self.missing_genes[col] = None
# collect stats
self._statistics["genes"]["missing (total)"] = self.missing_genes[
"locus_tag"
].nunique() + int(self.missing_genes["locus_tag"].isna().sum())
# save genes with no locus tag for manual curation
if "ncbiprotein" in self.missing_genes.columns:
self.manual_curation["genes"]["gff no locus tag"] = self.missing_genes[
self.missing_genes["locus_tag"].isnull()
]["ncbiprotein"]
else:
self.missing_genes["ncbiprotein"] = None
# Statistics on unmappable genes
self._statistics["genes"]["unmappable"] = int(
self.missing_genes["locus_tag"].isna().sum()
) # no locus tag
# formatting
# ncbiprotein | locus_tag | ec-code
self.missing_genes = self.missing_genes[
~self.missing_genes["locus_tag"].isnull()
]
self.missing_genes = self.missing_genes.explode("ncbiprotein")
[docs]
def find_missing_reactions(
self,
model: cobra.Model,
# prefix for pseudo ncbiprotein ids
prefix: str = "refinegems",
type_db: Literal["swissprot", "user"] = "swissprot",
# database information
fasta: str = None,
dmnd_db: str = None,
map_db: str = None,
# SwissProt - NCBI params
mail: str = None,
check_NCBI: bool = False,
# other params
threshold_add_reacs: int = 5,
# further optional params for the mapping
**kwargs,
) -> None:
"""Find missing reactions in the model by blasting the missing genes
against the SwissProt database and mapping the results to EC/BRENDA.
Optionally, query the protein accession numbers against NCBI
(increases runtime significantly).
.. hint::
For more information on how to get the SwissProt files, please see
:py:func:`~refinegems.utility.set_up.download_url`.
Args:
- model (cobra.Model):
The model loaded with COBRApy.
- prefix (str, optional):
Prefix for gene IDs in the model, the have to be generated
randomly, as no ID from the chosen namespace (usually NCBI protein)
has been found. Defaults to 'refinegems'.
- mail (str, optional):
Mail address for the query against NCBI.
If not set, skips the mapping.
Defaults to None.
- check_NCBI (bool, optional):
If set to True, checking the gene IDs / NCBI protein IDs against the
NCBI database is enabled. Else, this step is skipped to reduce runtime.
Only usable with SwissProt as database.
Defaults to False.
- fasta (str, optional):
The protein FASTA file of the organism the model was build on.
Required for the searchh against SwissProt.
Defaults to None.
- type_db (Literal['swissprot','user'], optional):
Database to search against.
Choose 'swissprot' for SwissProt or 'user' for a user defined database.
Defaults to 'swissprot'.
- dmnd_db (str, optional):
Path to the DIAMOND database containing the protein sequences of SwissProt.
Required for the search against SwissProt or rhe users own DIAMOND dn.
Defaults to None.
- map_db (str, optional):
Path to the SwissProt mapping file.
Required for the search against SwissProt.
Greatly decreases runtime for running the DIAMOND search.
..note::
The mapping depends on the chosen database.
Defaults to None.
- threshold_add_reacs (int, optional):
Threshold for the amount of reactions to add to the model.
Defaults to 5.
- **kwargs:
Further optional parameters for the mapping,
e.g. outdir, sens, cov, t, pid, etc.
For more information see :py:func:`refinegems.utility.db_access.map_to_homologs`
in case of type_db = 'user' or :py:func:`refinegems.utility.db_access.get_ec_via_swissprot`
in case of type_db = 'swissprot'.
"""
self._variety = f"{self._variety} + {type_db}"
# try to identfy missing ECs
# --------------------------
case_1 = self.missing_genes[self.missing_genes["ec-code"].isnull()]
not_case_1 = self.missing_genes[~self.missing_genes["ec-code"].isnull()]
if len(case_1) > 0:
# BLAST against a dmnd to retrieve EC numbers
# +++++++++++++++++++++++++++++++++++++++++++
match type_db:
# type_db = swissprot: BLAST against SwissProt
# -> BLAST (DIAMOND) against SwissProt to get EC/BRENDA
case "swissprot":
if fasta and dmnd_db and map_db:
case_1_mapped = get_ec_via_swissprot(
fasta, dmnd_db, case_1, map_db, **kwargs
) # further optional params for the mapping
case_1.drop("ec-code", inplace=True, axis=1)
case_1 = case_1.merge(case_1_mapped, on="locus_tag", how="left")
not_case_1["UniProt"] = None
# still no EC but ncbiprotein
# -> access ncbi for ec (optional)
# @DEBUG .......................
# mapped_reacs = mapped_reacs.iloc[300:350,:]
# print(UserWarning('Running in debugging mode.'))
# ..............................
if check_NCBI and mail:
mapped_reacs["ec-code"] = mapped_reacs.progress_apply(
lambda x: (
get_ec_from_ncbi(mail, x["ncbiprotein"])
if not x["ec-code"]
and not x["ncbiprotein"].isnull()
else x["ec-code"]
),
axis=1,
)
# type_db = user: BLAST against user defined database
case "user":
if fasta and dmnd_db:
case_1 = map_to_homologs(
fasta, dmnd_db, case_1, map_db, email=mail, **kwargs
)
case _:
raise ValueError(
"Please choose one of the valid database types 'swissprot' or 'user' for the GeneGapFiller."
)
# no ncbiprotein, no EC
self.manual_curation["genes"]["no ncbiprotein, no EC"] = case_1[
case_1["ncbiprotein"].isnull() & case_1["ec-code"].isnull()
]
self._statistics["genes"]["unmappable"] += self.manual_curation["genes"][
"no ncbiprotein, no EC"
]["locus_tag"].nunique()
# combine with the rest
mapped_reacs = pd.concat(
[
case_1[~(case_1["ncbiprotein"].isnull() & case_1["ec-code"].isnull())],
not_case_1,
]
)
# convert NaNs to None
mapped_reacs.mask(mapped_reacs.isnull(), other=None, inplace=True)
# save entries with no EC for manual curation
self.manual_curation["genes"]["no EC"] = mapped_reacs[
mapped_reacs["ec-code"].isnull()
]
self._statistics["genes"]["unmappable"] += self.manual_curation["genes"][
"no EC"
]["locus_tag"].nunique()
mapped_reacs = mapped_reacs[~mapped_reacs["ec-code"].isnull()]
# check, if any automatic gapfilling is still possible
if mapped_reacs.empty:
logging.warning(
f"No missing reactions for the provided model {model.id} were found via {self._variety}."
)
return None
# create pseudoids for entries with no ncbiprotein id
mapped_reacs["ncbiprotein"] = mapped_reacs.apply(
lambda x: f'{x["locus_tag"]}' if not x["ncbiprotein"] else x["ncbiprotein"],
axis=1,
)
# EC found
# ---------
# update the gene information
updated_missing_genes = mapped_reacs.copy()
# reformat missing reacs
if type_db == "swissprot":
mapped_reacs.drop(["locus_tag"], inplace=True, axis=1)
# transform table into EC-number vs. list of NCBI protein IDs
eccode = (
mapped_reacs["ec-code"]
.apply(pd.Series)
.reset_index()
.melt(id_vars="index")
.dropna()[["index", "value"]]
.set_index("index")
)
ncbiprot = (
mapped_reacs["ncbiprotein"]
.apply(pd.Series)
.reset_index()
.melt(id_vars="index")
.dropna()[["index", "value"]]
.set_index("index")
)
mapped_reacs = pd.merge(
eccode, ncbiprot, left_index=True, right_index=True
).rename(columns={"value_x": "ec-code", "value_y": "ncbiprotein"})
mapped_reacs = (
mapped_reacs.groupby(mapped_reacs["ec-code"])
.aggregate({"ncbiprotein": "unique"})
.reset_index()
)
# map EC to reactions
mapped_reacs = map_ec_to_reac(
mapped_reacs[["ec-code", "ncbiprotein"]], threshold_add_reacs
)
self._statistics["reactions"]["missing (based on genes)"] = mapped_reacs[
"id"
].nunique()
# save for manual curation
self.manual_curation["reactions"]["no mapping"] = mapped_reacs[
mapped_reacs["id"].isnull()
]
self._statistics["reactions"]["unmappable"] = int(
self.manual_curation["reactions"]["no mapping"]["id"].isna().sum()
)
# map to model
mapped_reacs = mapped_reacs[~mapped_reacs["id"].isnull()]
mapped_reacs["add_to_GPR"] = mapped_reacs.apply(
lambda x: self._find_reac_in_model(model, x["ec-code"], x["id"], x["via"]),
axis=1,
)
# Track all remaining statistics
# statistics on reactions already in model
self._statistics["reactions"]["already in model"] = mapped_reacs[
~mapped_reacs["add_to_GPR"].isnull()
]["id"].nunique()
# statistics on mapped reactions
self._statistics["reactions"]["mapped to MNX"] = mapped_reacs[
mapped_reacs["add_to_GPR"].isnull()
][mapped_reacs["via"] == "MetaNetX"]["id"].nunique()
self._statistics["reactions"]["mapped to BiGG"] = mapped_reacs[
mapped_reacs["add_to_GPR"].isnull()
][mapped_reacs["via"] == "BiGG"]["id"].nunique()
self._statistics["reactions"]["mapped to KEGG"] = mapped_reacs[
mapped_reacs["add_to_GPR"].isnull()
][mapped_reacs["via"] == "KEGG"]["id"].nunique()
# calculate remaining statistics
self._statistics["reactions"]["missing (total)"] = (
self._statistics["reactions"]["missing (based on genes)"]
- self._statistics["reactions"]["already in model"]
)
self._statistics["reactions"]["missing (remaining)"] = self._statistics[
"reactions"
]["unmappable"]
self._statistics["genes"]["missing (mappable)"] = updated_missing_genes[
"locus_tag"
].nunique()
self._statistics["genes"]["missing (remaining)"] = self._statistics["genes"][
"unmappable"
]
# update attributes
if type_db == "user":
updated_missing_genes = updated_missing_genes.drop(
columns=["locus_tag_ref", "old_locus_tag"], axis=1
)
self.missing_genes = updated_missing_genes
self.missing_reactions = mapped_reacs
############################################################################
# functions (2)
############################################################################
# Gap-filling via medium/media
# ----------------------------
[docs]
def single_cobra_gapfill(
model: cobra.Model,
universal: cobra.Model,
medium: Medium,
namespace: Literal["BiGG"] = "BiGG",
growth_threshold: float = 0.05,
) -> Union[list[str], bool]:
"""Attempt gapfilling (with COBRApy) for a given model to allow growth on a given
medium.
Args:
- model (cobra.Model):
The model to perform gapfilling on.
- universal (cobra.Model):
A model with reactions to be potentially used for the gapfilling.
- medium (Medium):
A medium the model should grow on.
- namespace (Literal['BiGG'], optional): Namespace to use for the model.
Defaults to 'BiGG'.
- growth_threshold (float, optional): Minimal rate for the model to be considered growing.
Defaults to 0.05.
Returns:
Union[list[str],True]:
List of reactions to be added to the model to allow growth
or True, if the model already grows.
"""
# perform the gapfilling
solution = []
with model as model_copy:
# set medium model should grow on
medium_to_model(model_copy, medium, namespace, double_o2=False)
# if model does not show growth (depending on threshold), perform gapfilling
if model_copy.optimize().objective_value < growth_threshold:
try:
solution = cobra.flux_analysis.gapfill(
model_copy,
universal,
lower_bound=growth_threshold,
demand_reactions=False,
)
except cobra.exceptions.Infeasible:
warnings.warn(
f"Gapfilling for medium {medium.name} failed. Manual curation required."
)
else:
print(
f"Model already grows on medium {medium.name} with objective value of {model_copy.optimize().objective_value}"
)
return True
return solution
[docs]
def cobra_gapfill_wrapper(
model: cobra.Model,
universal: cobra.Model,
medium: Medium,
namespace: Literal["BiGG"] = "BiGG",
growth_threshold: float = 0.05,
iterations: int = 3,
chunk_size: int = 10000,
) -> cobra.Model:
"""Wrapper for :py:func:`~refinegems.classes.gapfill.single_cobra_gapfill`.
Either use the full set of reactions in universal model by setting iteration to
0 or None or use them in randomized chunks for faster runtime (useful
on laptops). Note: when using the second option, be aware that this does not
test all reaction combinations exhaustively (heuristic approach!!!).
Args:
- model (cobra.Model):
The model to perform gapfilling on.
- universal (cobra.Model):
A model with reactions to be potentially used for the gapfilling.
- medium (Medium):
A medium the model should grow on.
- namespace (Literal['BiGG'], optional):
Namespace to use for the model.
Options include 'BiGG'.
Defaults to 'BiGG'.
- growth_threshold (float, optional):
Growth threshold for the gapfilling.
Defaults to 0.05.
- iterations (int, optional):
Number of iterations for the heuristic version of the gapfilling.
If 0 or None is given, uses full set of reactions.
Defaults to 3.
- chunk_size (int, optional):
Number of reactions to be used for gapfilling at the same time.
If None or 0 is given, use full set, not heuristic.
Defaults to 10000.
Returns:
cobra.Model:
The gapfilled model, if a solution was found.
"""
solution = []
# run a heuristic approach:
# for a given number of iterations, use a subset (chunk_size) of
# reactions for the gapfilling
if (iterations and iterations > 0) and (chunk_size and chunk_size > 0):
num_reac = len(model.reactions)
# for each iteration
for i in range(iterations):
not_used = [_ for _ in range(0, num_reac)]
# divide reactions in random subsets
for chunk in range(math.ceil(num_reac / chunk_size)):
if len(not_used) > chunk_size:
rng = np.random.default_rng()
reac_set = rng.choice(
not_used, size=chunk_size, replace=False, shuffle=False
)
not_used = [_ for _ in not_used if _ not in reac_set]
else:
reac_set = not_used
# get subset of reactions
subset_reac = cobra.Model("subset_reac")
for n in reac_set:
subset_reac.add_reactions([universal.reactions[n].copy()])
# gapfilling
solution = single_cobra_gapfill(
model, subset_reac, medium, namespace, growth_threshold
)
if (isinstance(solution, bool) and solution) or (
isinstance(solution, list) and len(solution) > 0
):
break
if (isinstance(solution, bool) and solution) or (
isinstance(solution, list) and len(solution) > 0
):
break
# use the whole reactions content of the universal model at once
# not advised for Laptops and PCs with small computing power
# may take a long time
else:
solution = single_cobra_gapfill(
model, universal, medium, namespace, growth_threshold
)
# if solution is found add new reactions to model
if isinstance(solution, list) and len(solution) > 0:
for reac in solution[0]:
reac.notes["creation"] = "via gapfilling"
print(
f"Adding {len(solution[0])} reactions to model to ensure growth on medium {medium.name}."
)
model.add_reactions(solution[0])
return model
[docs]
def multiple_cobra_gapfill(
model: cobra.Model,
universal: cobra.Model,
media_list: list[Medium],
namespace: Literal["BiGG"] = "BiGG",
growth_threshold: float = 0.05,
iterations: int = 3,
chunk_size: int = 10000,
) -> cobra.Model:
"""Perform :py:func:`~refinegems.classes.gapfill.cleanup.single_cobra_gapfill` on a list of media.
Args:
- model (cobra.Model):
The model to be gapfilled.
- universal (cobra.Model):
The model to use reactions for gapfilling from.
- media_list (list[Medium]):
List ofmedia the model is supposed to grow on.
- growth_threshold (float, optional):
Growth threshold for the gapfilling.
Defaults to 0.05.
- iterations (int, optional):
Number of iterations for the heuristic version of the gapfilling.
If 0 or None is given, uses full set of reactions.
Defaults to 3.
- chunk_size (int, optional):
Number of reactions to be used for gapfilling at the same time.
If None or 0 is given, use full set, not heuristic.
Defaults to 10000.
Returns:
cobra.Model:
The gapfilled model, if a solution was found.
"""
for medium in media_list:
model = cobra_gapfill_wrapper(
model,
universal,
medium,
namespace,
iterations,
chunk_size,
growth_threshold,
)
return model