Source code for refinegems.utility.entities

#!/usr/bin/env python
"""Collection of functions to access, handle and manipulate different entities of
COBRApy and libsbml models.
"""

__author__ = "Famke Baeuerle and Gwendolyn O. Döbel and Carolin Brune"

################################################################################
# requirements
################################################################################
import cobra
from cobra.io.sbml import _f_gene_rev
import json
import logging
import pandas as pd
import re
import requests
import urllib
import warnings

from Bio import Entrez
from Bio.KEGG import REST, Compound
from cobra.io.sbml import _f_gene
from datetime import date
from functools import reduce
from libsbml import Model as libModel
from libsbml import (
    Species,
    Reaction,
    FbcOr,
    FbcAnd,
    GeneProductRef,
    Unit,
    UnitDefinition,
    ListOfUnitDefinitions,
)
from libsbml import UNIT_KIND_MOLE, UNIT_KIND_GRAM, UNIT_KIND_LITRE, UNIT_KIND_SECOND
from libsbml import BQM_IS, BQM_IS_DERIVED_FROM, BQM_IS_DESCRIBED_BY
from pathlib import Path

from random import choice
from string import ascii_uppercase, digits
from typing import Union, Literal
from tqdm import tqdm

tqdm.pandas()
pd.options.mode.chained_assignment = None  # suppresses the pandas SettingWithCopyWarning; comment out before developing!!

from .cvterms import (
    add_cv_term_genes,
    add_cv_term_metabolites,
    add_cv_term_reactions,
    add_cv_term_units,
    _add_annotations_from_dict_cobra,
)
from .db_access import (
    kegg_reaction_parser,
    _add_annotations_from_bigg_reac_row,
    get_BiGG_metabs_annot_via_dbid,
    add_annotations_from_BiGG_metabs,
    _search_ncbi_for_gp,
)
from .io import load_a_table_from_database, parse_gff_for_cds
from .util import (
    COMP_MAPPING,
    DB2REGEX,
    VALID_COMPARTMENTS,
    MIN_GROWTH_THRESHOLD,
    is_stoichiometric_factor,
)
from ..developement.decorators import *

################################################################################
# variables
################################################################################

REF_COL_GF_GENE_MAP = {"UniProt": "UNIPROT", "GeneID": "NCBIGENE"}  #: :meta:

################################################################################
# functions
################################################################################

# ++++++++++++++++++++++++++++++++++++++++
# COBRApy - models
# ++++++++++++++++++++++++++++++++++++++++

# handling compartments
# ---------------------



[docs]
def are_compartment_names_valid(model: cobra.Model) -> bool:
    """Check if compartment names of model are considered valid based on
    :py:const:`~refinegems.utility.util.VALID_COMPARTMENTS`.

    Args:
        - model (cobra.Model):
            The model, loaded with COBRApy.

    Returns:
        bool:
            True, if valid, else false.
    """

    for c in model.compartments.keys():
        if c not in VALID_COMPARTMENTS.keys():
            return False

    return True




[docs]
def resolve_compartment_names(model: cobra.Model):
    """Resolves compartment naming problems.

    Args:
        - model (cobra.Model):
            A COBRApy model object.

    Raises:
        - KeyError: Unknown compartment raises an error to add it to the mapping. Important for developers.
    """

    # check if compartment names are valid
    if not are_compartment_names_valid(model):

        # check if mapping is possible
        if set(model.compartments.keys()).issubset(set(COMP_MAPPING.keys())):
            # for each metabolite rename the compartment
            for metabolite in model.metabolites:
                metabolite.compartment = COMP_MAPPING[metabolite.compartment]
            # add whole descriptions of the compartments to the model
            # note:
            #    only compartments IN the model will be added
            model.compartments = VALID_COMPARTMENTS
            if "uc" in model.compartments.keys():
                logging.warning("Unknown compartment(s) detected and (re)named 'uc'. Simulation results might be affected.")

        else:
            raise KeyError(
                f"Unknown compartment {[_ for _ in model.compartments if _ not in COMP_MAPPING.keys()]} detected. Please contact developers or change compartment names manually."
            )



# handling cobra entities (features)
# ----------------------------------



[docs]
def reaction_equation_to_dict(eq: str, model: cobra.Model) -> dict:
    """Parses a reaction equation string to dictionary

    Args:
        - eq (str):
            Equation of a reaction
        - model (cobra.Model):
            Model loaded with COBRApy

    Returns:
        dict:
            Metabolite Ids as keys and their coefficients as values (negative = reactants, positive = products)
    """
    # from Alina Renz
    eq = eq.split(" ")
    eq_matrix = {}
    are_products = False
    coeff = 1
    for i, part in enumerate(eq):
        if part == "-->":
            are_products = True
            continue
        if part == "+":
            continue
        if part == "2":
            coeff = 2
            continue
        if are_products:
            eq_matrix[model.metabolites.get_by_id(part)] = 1 * coeff
            coeff = 1
        else:
            eq_matrix[model.metabolites.get_by_id(part)] = -1 * coeff
            coeff = 1
    return eq_matrix




[docs]
def get_reaction_annotation_dict(
    model: cobra.Model, db: Literal["KEGG", "BiGG"]
) -> dict:
    """Create a dictionary of a model's reaction IDs and a chosen database ID as
    saved in the annotations of the model.

    The database ID can be choosen based on the strings for the namespace options
    in other functions.

    Args:
        - model (cobra.Model):
            A model loaded with COBRApy.
        - db (Literal['KEGG','BiGG']):
            The string denoting the database to map to.

    Raises:
        - ValueError: Unknown database string for paramezer db

    Returns:
        dict:
            The mapping of the reaction IDs to the database IDs found in the annotations
    """

    react_dict = {}

    match db:
        case "KEGG":
            db_string = "kegg.reaction"
        case "BiGG":
            db_string = "bigg.reaction"
        case _:
            mes = f"Unknown database string for parameter db: {db}"
            raise ValueError(mes)

    for r in model.reactions:
        if db_string in r.annotation.keys():
            react_dict[r.id] = r.annotation[db_string]
        else:
            react_dict[r.id] = "-"

    return react_dict




[docs]
def create_random_id(
    model: cobra.Model, entity_type: Literal["reac", "meta"] = "reac", prefix: str = ""
) -> str:
    """Generate a unique, random ID for a model entity for a model.

    Args:
        - model (cobra.Model):
            A model loaded with COBRApy.
        - entity_type (Literal['reac','meta'], optional):
            Type of model entity.
            Can be 'reac' for Reaction or 'meta' for Metabolite.
            Defaults to 'reac'.
        - prefix (str, optional):
            Prefix to set for the randomised part.
            Useful to identify the random IDs later on.
            Defaults to ''.

    Raises:
        - ValueError: Unknown entity_type

    Returns:
        str:
            The generate new and unique ID.
    """

    match entity_type:
        case "reac":
            all_ids = [_.id for _ in model.reactions]
        case "meta":
            all_ids = [_.id for _ in model.metabolites]
        case _:
            mes = f"Unkown entity_type: {entity_type}"
            raise ValueError(mes)

    prefix = f"{prefix}{entity_type}"
    var = "".join(choice(ascii_uppercase + digits) for i in range(6))
    label = prefix + var
    j = 6

    while True:

        for i in range(36**6):  # make sure it does not run endlessly
            if label in all_ids:
                label = prefix + "".join(
                    choice(ascii_uppercase + digits) for x in range(j)
                )
            else:
                return label

        j = j + 1




[docs]
def match_id_to_namespace(
    model_entity: Union[cobra.Reaction, cobra.Metabolite], namespace: Literal["BiGG"]
) -> None:
    """Based on a given namespace, change the ID of a given model entity to the set namespace.

    Currently working namespaces:

        - BiGG

    Args:
        - model_entity (cobra.Reaction, cobra.Metabolite]):
            The model entity.
            Can be either a cobra.Reaction or cobra.Metabolite object.
        - namespace (Literal['BiGG']):
            The chosen namespace.

    Raises:
        - ValueError: Unknown input for namespace
        - TypeError: Unknown type for model_entity
    """

    match model_entity:

        # Reaction
        # --------
        case cobra.Reaction():
            match namespace:

                case "BiGG":
                    if "bigg.reaction" in model_entity.annotation.keys():
                        # currently takes first entry if annotation is list
                        model_entity.id = (
                            model_entity.annotation["bigg.reaction"]
                            if isinstance(model_entity.annotation["bigg.reaction"], str)
                            else model_entity.annotation["bigg.reaction"][0]
                        )

                case _:
                    mes = f"Unknown input for namespace: {namespace}"
                    raise ValueError(mes)

        # Metabolite
        # ----------
        case cobra.Metabolite():
            match namespace:

                case "BiGG":
                    if "bigg.metabolite" in model_entity.annotation.keys():
                        # currently takes first entry if annotation is list
                        model_entity.id = (
                            model_entity.annotation["bigg.metabolite"]
                            + "_"
                            + model_entity.compartment
                            if isinstance(
                                model_entity.annotation["bigg.metabolite"], str
                            )
                            else model_entity.annotation["bigg.metabolite"][0]
                        )

                case _:
                    mes = f"Unknown input for namespace: {namespace}"
                    raise ValueError(mes)
        # Error
        # -----
        case _:
            mes = f"Unknown type for model_entity: {type(model_entity)}"
            raise TypeError(mes)




[docs]
def isreaction_complete(
    reac: cobra.Reaction,
    name_check: bool = False,
    formula_check: Literal["none", "existence", "wildcard", "strict"] = "existence",
    exclude_dna: bool = True,
    exclude_rna: bool = True,
) -> bool:
    """Check, if a reaction object can be considered a complete reaction.
    The parameters can set the strictness of the checking.
    Useful for checking, if a reaction can be added to a model
    or if it might break it. For example, a missing model ID in either reaction or
    metabolites returns false.



    Args:
        - reac (cobra.Reaction):
            The reaction object (COBRApy) to test.
        - name_check (bool, optional):
            Option to force reaction and metabolites to have the name attribute set.
            Defaults to False.
        - formula_check (Literal['none','existence','wildcard','strict'], optional):
            Option to check the formula. 'none' disables the check,
            'existence' tests, if a formula is set, 'wildcard' additionally checks for wild cards
            (returns false if one found) and 'strict' also checks for the rest symbol 'R'.
            Defaults to 'existence'.
        - exclude_dna (bool, optional):
            Option to set DNA reactions to invalid. Defaults to True.
        - exclude_rna (bool, optional):
            Option to set RNA reaction to invalid. Defaults to True.

    Returns:
        bool:
            True, if the checks are passed successfully, else False.
    """

    # check reaction features
    # -----------------------
    # check name
    if name_check:
        if reac.id == "" or pd.isnull(reac.name):
            return False
    # check for RNA/DNA
    if exclude_dna and "DNA" in reac.name:
        return False
    if exclude_rna and "RNA" in reac.name:
        return False

    # check metabolites features
    # --------------------------
    for m in reac.metabolites:
        # check id
        if m.id == "" or pd.isnull(m.id):
            return False
        # check name
        if name_check and (m.name == "" or pd.isnull(m.name)):
            return False
        # check formula
        match formula_check:
            # case 1: no formula checking
            case "none":
                pass
            # case 2: check, if formula is set
            case "existence":
                if m.formula == "" or pd.isnull(m.formula):
                    return False
            # case 3: check, if formula is set and only contains alphanumerical characters
            case "wildcard":
                if m.formula == "" or pd.isnull(m.formula) or not m.formula.isalnum():
                    return False
            # case 4: check, if formula is set, constains only alphanumerical characters and no rest R
            case "strict":
                if (
                    m.formula == ""
                    or pd.isnull(m.formula)
                    or not m.formula.isalnum()
                    or bool(re.search(r"R(?![a-z])", "CH2HOROH"))
                ):
                    return False
            # default case: Warning + no formula check
            case _:
                mes = f"Unknown options for formula_check: {formula_check}\nChecking the metabolite formula will be skipped."
                warnings.warn(mes, UserWarning)

    return True



# adding metabolites to cobra models
# ----------------------------------



[docs]
@template
def build_metabolite_xxx(
    id: str, model: cobra.Model, namespace: str, compartment: str, idprefix: str
) -> cobra.Metabolite:
    """Template function for building a cobra.Metabolite.

    .. note::

        This is a template function for developers. It cannot be executed.

    Args:
        - id (str):
            _description_
        - model (cobra.Model):
            _description_
        - namespace (str):
            _description_
        - compartment (str):
            _description_
        - idprefix (str):
            _description_

    Returns:
        cobra.Metabolite:
            _description_
    """
    # change xxx to database or way the metabolite will be reconstructed with
    # check if id in model
    # get information via id
    # collection formation in a new metabolite object
    # add more annotations from other databases
    # adjust namespace
    # check model again for new namespace
    pass



# originally from SPECIMEN

[docs]
def build_metabolite_mnx(
    id: str,
    model: cobra.Model,
    namespace: str = "BiGG",
    compartment: str = "c",
    idprefix: str = "refineGEMs",
) -> Union[cobra.Metabolite, None]:
    """Build a cobra.Metabolite object from a MetaNetX ID.
    This function will NOT directly add the metabolite to the model,
    if the contruction is successful.

    Args:
        - id (str):
            A MetaNetX ID of a metabolite.
        - model (cobra.Model):
            The model, the metabolite will be build for.
        - namespace (str, optional):
            Name to use for the model ID. If namespace cannot be matched,
            will use a random ID.
            Defaults to 'BiGG'.
        - compartment (str, optional):
            Compartment of the metabolite. Defaults to 'c'.
        - idprefix (str, optional):
            Prefix for the random ID. Defaults to 'refineGEMs'.

    Returns:
        1. Case construction successful or match found in model:
                cobra.Metabolite:
                    The metabolite object.

        2. Case construction failed:
                None:
                    Nothing to return.
    """

    # fast check if compound already in model
    # ------------------------------------------
    # step 1: check if MetaNetX ID in model
    matches = [
        x.id
        for x in model.metabolites
        if "metanetx.chemical" in x.annotation
        and x.annotation["metanetx.chemical"] == id
        and x.compartment == compartment
    ]

    # step 2: if yes, retrieve metabolite from model
    # case 1: multiple matches found
    if len(matches) > 0:
        if len(matches) > 1:
            # currently, just the first match is taken
            match = model.metabolites.get_by_id(matches[0])
        #  case 2: only one match found
        else:
            match = model.metabolites.get_by_id(matches[0])

        # step 3: add metabolite
        return match

    # if not, create new metabolite
    # -----------------------------
    metabolite_prop = load_a_table_from_database(
        f"SELECT * FROM mnx_chem_prop WHERE id = '{id}'"
    )
    metabolite_anno = load_a_table_from_database(
        f"SELECT * FROM mnx_chem_xref WHERE id = '{id}'"
    )
    if len(metabolite_prop) == 0:  # cannot construct metabolite
        return None
    else:

        # step 1: create a random metabolite ID
        new_metabolite = cobra.Metabolite(create_random_id(model, "meta", idprefix))

        # step 2: add features
        # --------------------
        new_metabolite.formula = metabolite_prop["formula"].iloc[0]
        new_metabolite.name = metabolite_prop["name"].iloc[0]
        new_metabolite.charge = metabolite_prop["charge"].iloc[0]
        new_metabolite.compartment = compartment

        # step 3: add notes
        # -----------------
        new_metabolite.notes["created with"] = "refineGEMs based on metanetx.chemical"

        # step 4: add annotations
        # -----------------------
        # add SBOTerm
        new_metabolite.annotation["sbo"] = "SBO:0000247"

        # add information directly available from the mnx_chem_prop table
        new_metabolite.annotation["metanetx.chemical"] = [metabolite_prop["id"].iloc[0]]
        if not pd.isnull(metabolite_prop["InChIKey"].iloc[0]):
            new_metabolite.annotation["inchikey"] = (
                metabolite_prop["InChIKey"].iloc[0].split("=")[1]
            )

        # get more annotation from the mnx_chem_xref table
        for db in [
            "kegg.compound",
            "metacyc.compound",
            "seed.compound",
            "bigg.metabolite",
            "chebi",
        ]:
            db_matches = metabolite_anno[metabolite_anno["source"].str.contains(db)]
            if len(db_matches) > 0:
                new_metabolite.annotation[db] = [
                    m.split(":", 1)[1] for m in db_matches["source"].tolist()
                ]

        # Cleanup BiGG annotations
        if not "bigg.metabolite" in new_metabolite.annotation.keys():
            # if no BiGG was found in MetaNetX, try reverse search in BiGG
            get_BiGG_metabs_annot_via_dbid(
                new_metabolite, id, "MetaNetX (MNX) Chemical", compartment
            )

            # add additional information from BiGG (if ID found)
            add_annotations_from_BiGG_metabs(new_metabolite)

        # step 5: change ID according to namespace
        # ----------------------------------------
        match_id_to_namespace(new_metabolite, namespace)

        # step 6: re-check existence of ID in model
        # -----------------------------------------
        if new_metabolite.id in [_.id for _ in model.metabolites]:
            return model.metabolites.get_by_id(new_metabolite.id)

    return new_metabolite



# originally from SPECIMEN

[docs]
def build_metabolite_kegg(
    kegg_id: str,
    model: cobra.Model,
    namespace: Literal["BiGG"] = "BiGG",
    compartment: str = "c",
    idprefix="refineGEMs",
) -> Union[cobra.Metabolite, None]:
    """Build a cobra.Metabolite object from a KEGG ID.
    This function will NOT directly add the metabolite to the model,
    if the contruction is successful.

    Args:
        - kegg_id (str):
            A KEGG ID of a metabolite.
        - model (cobra.Model):
            The model, the metabolite will be build for.
        - namespace (str, optional):
            Name to use for the model ID. If namespace cannot be matched,
            will use a random ID.
            Defaults to 'BiGG'.
        - compartment (str, optional):
            Compartment of the metabolite. Defaults to 'c'.
        - idprefix (str, optional):
            Prefix for the random ID. Defaults to 'refineGEMs'.

    Returns:
        1. Case construction successful or match found in model:
                cobra.Metabolite:
                    The build metabolite object.

        2. Case construction failed:
                None:
                    Nothing to return.
    """

    # ---------------------------------------
    # fast check if compound already in model
    # ---------------------------------------
    # step 1: check via KEGG ID
    matches = [
        x.id
        for x in model.metabolites
        if (
            "kegg.compound" in x.annotation and x.annotation["kegg.compound"] == kegg_id
        )
    ]
    if len(matches) > 0:
        # step 2: model id --> metabolite object
        #  case 1: multiple matches found
        if len(matches) > 1:
            # Currently takes only the first match
            match = model.metabolites.get_by_id(matches[0])
        #  case 2: only one match found
        else:
            match = model.metabolites.get_by_id(matches[0])

        # step 3: add metabolite
        return match

    # -----------------------------
    # if not, create new metabolite
    # -----------------------------

    # step 1: retrieve KEGG entry for compound
    # ----------------------------------------
    try:
        kegg_handle = REST.kegg_get(kegg_id)
        kegg_record = [r for r in Compound.parse(kegg_handle)][0]
    except urllib.error.HTTPError:
        warnings.warn(f"HTTPError: {kegg_id}")
        return None
    except ConnectionResetError:
        warnings.warn(f"ConnectionResetError: {kegg_id}")
        return None
    except urllib.error.URLError:
        warnings.warn(f"URLError: {kegg_id}")
        return None

    # step 2: create a random metabolite ID
    # -------------------------------------
    new_metabolite = cobra.Metabolite(create_random_id(model, "meta", idprefix))

    # step 3: add features
    # --------------------
    # set name from KEGG and additionally use it as ID if there is none yet
    if isinstance(kegg_record.name, list) and len(kegg_record.name) > 0:
        new_metabolite.name = kegg_record.name[0]
    elif isinstance(kegg_record.name, str) and len(kegg_record.name) > 0:
        new_metabolite.name = kegg_record.name
    else:
        new_metabolite.name = ""
    # set compartment
    new_metabolite.compartment = compartment
    # set formula
    new_metabolite.formula = kegg_record.formula

    # step 4: add notes
    # -----------------
    new_metabolite.notes["created with"] = "refineGEMs based on KEGG.compound"

    # step 5: add annotations
    # -----------------------
    # add annotation from the KEGG entry
    new_metabolite.annotation["kegg.compound"] = kegg_id
    db_idtf = {"CAS": "cas", "PubChem": "pubchem.compound", "ChEBI": "chebi"}
    for db, ids in kegg_record.dblinks:
        if db in db_idtf:
            new_metabolite.annotation[db_idtf[db]] = ids

    # add SBOTerm
    new_metabolite.annotation["sbo"] = "SBO:0000247"

    # search for infos in MetaNetX
    mnx_info = load_a_table_from_database(
        f"SELECT * FROM mnx_chem_xref WHERE source = 'kegg.compound:{kegg_id}'",
        query=True,
    )
    # if matches have been found
    if len(mnx_info) > 0:
        mnx_ids = list(set(mnx_info["id"]))
        
        # in case of an ambiguous mapping, take first match
        if len(mnx_ids) > 1:
            logging.warning('MNX mapping ambiguously due to multiple matches: {mnx_ids}\n-> Using first entry for building metabolite.')
            mnx_ids = [mnx_ids[0]]
        
        # mapping is unambiguously (or made that way)
        mnx_info = load_a_table_from_database(
            f"SELECT * FROM mnx_chem_prop WHERE id = '{mnx_ids[0]}'", query=True
        )
        # add charge
        new_metabolite.charge = mnx_info["charge"].iloc[0]
        # add more annotations
        new_metabolite.annotation["metanetx.chemical"] = [mnx_info["id"].iloc[0]]
        if not pd.isnull(mnx_info["InChIKey"].iloc[0]):
            new_metabolite.annotation["inchikey"] = (
                mnx_info["InChIKey"].iloc[0].split("=")[1]
            )

        # get more annotation from the mnx_chem_xref table
        metabolite_anno = load_a_table_from_database(
            f'SELECT * FROM mnx_chem_xref WHERE id = \'{mnx_info["id"]}\''
        )
        for db in [
            "kegg.compound",
            "metacyc.compound",
            "seed.compound",
            "bigg.metabolite",
            "chebi",
        ]:
            db_matches = metabolite_anno[metabolite_anno["source"].str.contains(db)]
            if len(db_matches) > 0:
                mnx_tmp = [
                    m.split(":", 1)[1] for m in db_matches["source"].tolist()
                ]
                if db in new_metabolite.annotation.keys():
                    new_metabolite.annotation[db] = list(
                        set(mnx_tmp + new_metabolite.annotation[db])
                    )
                else:
                    new_metabolite.annotation[db] = mnx_tmp

    # Cleanup BiGG annotations
    if not "bigg.metabolite" in new_metabolite.annotation.keys():
        # if no BiGG was found in KEGG, try reverse search in BiGG
        get_BiGG_metabs_annot_via_dbid(new_metabolite, id, "KEGG Compound", compartment)

        # add additional information from BiGG (if ID found)
        add_annotations_from_BiGG_metabs(new_metabolite)

    # step 6: change ID according to namespace
    # ----------------------------------------
    match_id_to_namespace(new_metabolite, namespace)

    # step 7: re-check existence of ID in model
    # -----------------------------------------
    if new_metabolite.id in [_.id for _ in model.metabolites]:
        return model.metabolites.get_by_id(new_metabolite.id)

    return new_metabolite




[docs]
def build_metabolite_bigg(
    id: str,
    model: cobra.Model,
    namespace: Literal["BiGG"] = "BiGG",
    idprefix: str = "refineGEMs",
) -> Union[cobra.Metabolite, None]:
    """Build a cobra.Metabolite object from a BiGG ID.
    This function will NOT directly add the metabolite to the model,
    if the contruction is successful.

    Args:
        - id (str):
            A BiGG ID of a metabolite.
        - model (cobra.Model):
            The model, the metabolite will be build for.
        - namespace (str, optional):
            Name to use for the model ID. If namespace cannot be matched,
            will use a random ID.
            Defaults to 'BiGG'.
        - compartment (str, optional):
            Compartment of the metabolite. Defaults to 'c'.
        - idprefix (str, optional):
            Prefix for the random ID. Defaults to 'refineGEMs'.

    Returns:
        1. Case construction successful or match found in model:
                cobra.Metabolite:
                    The build metabolite object.

        2. Case construction failed:
                None:
                    Nothing to return.
    """

    compartment = id.rsplit("_", 1)[1]
    # ------------------------------------------
    # fast check if compound already in model
    # ------------------------------------------
    # step 1: check if MetaNetX ID in model
    matches = [
        x.id
        for x in model.metabolites
        if "bigg.metabolite" in x.annotation
        and (
            x.annotation["bigg.metabolite"] == id
            or x.annotation["bigg.metabolite"] == id.rsplit("_", 1)[0]
        )
        and x.compartment == compartment
    ]
    # step 2: if yes, retrieve metabolite from model
    # case 1: multiple matches found
    if len(matches) > 0:
        if len(matches) > 1:
            # currently, just the first one is taken
            match = model.metabolites.get_by_id(matches[0])
        #  case 2: only one match found
        else:
            match = model.metabolites.get_by_id(matches[0])

        # step 3: add metabolite
        return match

    # -----------------------------
    # if not, create new metabolite
    # -----------------------------
    # get information from the database
    bigg_res = load_a_table_from_database(
        f"SELECT * FROM bigg_metabolites WHERE id = '{id}'", query=True
    )
    if len(bigg_res) > 0:
        bigg_res = bigg_res.iloc[0, :]
    else:
        return None  # not data = no recontruction
    # get information from MNX if ID available
    mnx_res = None
    if bigg_res["MetaNetX (MNX) Chemical"]:
        mnx_res = load_a_table_from_database(
            f'SELECT * FROM mnx_chem_prop WHERE id=\'{bigg_res["MetaNetX (MNX) Chemical"]}\'',
            query=True,
        )
        if len(mnx_res) > 0:
            mnx_res = mnx_res.iloc[0, :]
        else:
            mnx_res = None

    # step 1: create a random metabolite ID
    # -------------------------------------
    new_metabolite = cobra.Metabolite(create_random_id(model, "meta", idprefix))

    # step 2: add features
    # --------------------
    new_metabolite.name = bigg_res["name"]
    new_metabolite.compartment = compartment
    if mnx_res is not None:
        if mnx_res["charge"]:
            new_metabolite.charge = mnx_res["charge"]
        if mnx_res["formula"]:
            new_metabolite.formula = mnx_res["formula"]

    if not new_metabolite.formula or not new_metabolite.charge:
        try:
            bigg_fetch = json.loads(
                requests.get(
                    f'http://bigg.ucsd.edu/api/v2/universal/metabolites/{id.rsplit("_",1)[0]}'
                ).text
            )
            if "formulae" in bigg_fetch.keys() and not new_metabolite.formula:
                new_metabolite.formula = bigg_fetch["formulae"][0]
            if "charges" in bigg_fetch.keys() and new_metabolite.charge:
                new_metabolite.charge = bigg_fetch["charges"][0]
        except Exception as e:
            pass

    # step 3: add notes
    # -----------------
    new_metabolite.notes["created with"] = "refineGEMs based on BiGG"

    # step 4: add annotations
    # -----------------------
    # add SBOTerm
    new_metabolite.annotation["sbo"] = "SBO:0000247"
    # add infos from BiGG
    new_metabolite.annotation["bigg.metabolite"] = [id.removesuffix(f"_{compartment}")]
    add_annotations_from_BiGG_metabs(new_metabolite)
    # add annotations from MNX
    if mnx_res is not None:
        if mnx_res["InChI"] and "inchi" not in new_metabolite.annotation.keys():
            new_metabolite.annotation["inchi"] = mnx_res["InChI"]
        if mnx_res["InChIKey"] and "inchikey" not in new_metabolite.annotation.keys():
            new_metabolite.annotation["inchikey"] = mnx_res["InChIKey"]

    # step 5: change ID according to namespace
    # ----------------------------------------
    match_id_to_namespace(new_metabolite, namespace)

    # step 6: re-check existence of ID in model
    # -----------------------------------------
    if new_metabolite.id in [_.id for _ in model.metabolites]:
        return model.metabolites.get_by_id(new_metabolite.id)

    return new_metabolite



# adding reactions to cobra models
# --------------------------------



[docs]
def parse_reac_str(
    equation: str, type: Literal["BiGG", "BioCyc", "MetaNetX", "KEGG"] = "MetaNetX"
) -> tuple[dict, dict, list, bool]:
    """Parse a reaction string.

    Args:
        - equation (str):
            The equation of a reaction as a string (as saved in the database).
        - type (Literal['BiGG','BioCyc','MetaNetX','KEGG'], optional):
            The name of the database the equation was taken from.
            Can be 'BiGG','BioCyc','MetaNetX','KEGG'.
            Defaults to 'MetaNetX'.

    Returns:
        tuple:
            Tuple of (1) dict, (2) dict, (3) list & (4) bool:

            1. Dictionary with the reactant IDs and their stoichiometric factors.
            2. Dictionary with the product IDs and their stoichiometric factors.
            3. List of compartment IDs or None, if they cannot be extract from the equation.
            4. True, if the reaction is reversible, else False.
    """

    products = {}
    reactants = {}
    compartments = list()
    is_product = False
    reversible = True

    match type:
        case "MetaNetX":
            for s in equation.split(" "):
                # switch from reactants to products
                if s == "=":
                    is_product = True
                # found stoichiometric factor
                elif s.isnumeric():
                    factor = float(s)
                # skip
                elif s == "+":
                    continue
                # found metabolite
                else:
                    # get information from MetaNetX
                    metabolite, compartment = s.split("@")
                    compartments.append(compartment)

                    if is_product:
                        products[metabolite] = factor
                    else:
                        reactants[metabolite] = factor

        case "BiGG":
            factor = 1.0  # BiGG does not use factor 1 in the equations
            for s in equation.split(" "):
                # found factor
                if is_stoichiometric_factor(s):
                    factor = float(s)
                # switch from reactants to products
                elif s == "-->":
                    is_product = True
                    reversible = False
                elif s == "<->":
                    is_product = True
                # skip
                elif s == "+":
                    continue
                # found metabolite
                else:
                    if s:
                        compartments.append(s.rsplit("_", 1)[1])
                        if is_product:
                            products[s] = factor
                        else:
                            reactants[s] = factor
                    factor = 1.0

        case "KEGG":
            compartments = None
            factor = 1.0
            for s in equation.split(" "):
                if s.isnumeric():
                    factor = float(s)
                elif s == "+":
                    continue
                elif s == "<=>":
                    is_product = True
                else:
                    if is_product:
                        products[s] = factor
                    else:
                        reactants[s] = factor
                    factor = 1.0

    return (reactants, products, compartments, reversible)




[docs]
def validate_reaction_compartment_bigg(comps: list) -> Union[bool, Literal["exchange"]]:
    """Retrieves and validates the compatment(s) of a reaction from the BiGG namespace

    Args:
        - comps (list):
            List containing compartments from a BiGG reaction

    Returns:

        (1) Case ``compartment in VALID_COMPARTMENTS.keys()``
                bool|'exchange':
                    Either

                    - True if the provided compartments are valid
                    - 'exchange' if reaction in multiple compartments

        (2) Case not a valid compartment:
                bool:
                    'False' if one of the found compartments is not in :py:const:`~refinegems.utility.util.VALID_COMPARTMENTS`
    """

    contained_in_compartments = [
        (comp in VALID_COMPARTMENTS.keys()) for comp in comps
    ]  # Get True for correct compartment
    if not all(contained_in_compartments):  # At least one compartment not correct
        return False
    else:  # All compartments correct
        if (
            len(set(comps)) == 1
        ):  # Set of found compartments of reaction = 1: Reaction happens in one valid compartment
            return True
        else:  # Probably exchange reaction
            return "exchange"




[docs]
@template
def build_reaction_xxx():
    """
    Extend the build function so, that all of them can take either the id or an equation
    as input for rebuilding the reaction (would also be beneficial for semi-manual curation)

    model:cobra.Model, id:str=None,
                       reac_str:str=None,
                       references:dict={},
                       idprefix:str='refineGEMs',
                       namespace:Literal['BiGG']='BiGG') -> Union[cobra.Reaction, None, list]:
    """
    pass




[docs]
def build_reaction_mnx(
    model: cobra.Model,
    id: str,
    reac_str: str = None,
    references: dict = {},
    idprefix: str = "refineGEMs",
    namespace: Literal["BiGG"] = "BiGG",
) -> Union[cobra.Reaction, None, list]:
    """Construct a new reaction for a model from a MetaNetX reaction ID.
    This function will NOT add the reaction directly to the model, if the
    construction process is successful.

    Args:
        - model (cobra.Model):
            The model loaded with COBRApy.
        - id (str):
            A MetaNetX reaction ID.
        - reac_str (str, optional):
            The reaction equation string from the database.
            Defaults to None.
        - references (dict, optional):
            Additional annotations to add to the reaction (idtype:[value]).
            Defaults to {}.
        - idprefix (str, optional):
            Prefix for the pseudo-identifier. Defaults to 'refineGEMs'.
        - namespace (Literal['BiGG'], optional):
            Namespace to use for the reaction ID.
            If namespace cannot be matched, uses the pseudo-ID
            Defaults to 'BiGG'.

    Returns:
        1. Case successful construction:
                cobra.Reaction:
                    The newly build reaction object.

        2. Case construction not possible:
                None:
                    Nothing to return.

        3. Case reaction found in model.
                list:
                    List of matching reaction IDs (in model).
    """

    # ---------------------
    # check, if ID in model
    # ---------------------
    matches_found = [
        _.id
        for _ in model.reactions
        if "metanetx.reaction" in _.annotation.keys()
        and _.annotation["metanetx.reaction"] == id
    ]
    if len(matches_found) > 0:
        return matches_found

    # -----------------------------
    # otherwise, build new reaction
    # -----------------------------

    # get relevant part of table from database
    mnx_reac_refs = load_a_table_from_database(
        f"SELECT * FROM mnx_reac_xref WHERE id = '{id}'", query=True
    )
    mnx_reac_refs = mnx_reac_refs[
        ~(mnx_reac_refs["description"] == "secondary/obsolete/fantasy identifier")
    ]

    # create reaction object
    new_reac = cobra.Reaction(create_random_id(model, "reac", idprefix))

    # Get names for reaction
    names = []
    reaceqsyms = ["=", "+", "<", ">", "_"]
    important_keywords = ["rna", "dna"]
    for desc in mnx_reac_refs["description"]:
        splitted_descs = re.split(r"\|+", desc)

        # Exclude reaction equations
        names.extend([_ for _ in splitted_descs if not any(s in _ for s in reaceqsyms)])

    # Set name for reaction if possible
    kw_matches = [n for n in names if any(kw in n.lower() for kw in important_keywords)]

    if len(kw_matches) > 0:
        new_reac.name = kw_matches[0]  # Take first entry containing keyword
    elif len(names) > 0:
        new_reac.name = min(names, key=len)  # Take shortest entry
    else:
        new_reac.name = ""

    # get metabolites
    # ---------------
    if not reac_str:
        mnx_reac_prop = load_a_table_from_database(
            f"SELECT * FROM mnx_reac_prop WHERE id = '{id}'", query=True
        )
        reac_str = mnx_reac_prop["mnx_equation"][0]
        if mnx_reac_prop["ec-code"][0]:
            references["ec-code"] = mnx_reac_prop["ec-code"][0]

    if reac_str:
        reactants, products, comparts, rev = parse_reac_str(reac_str, "MetaNetX")
    else:
        return None

    # reac_prop / mnx equation only saves generic compartments 1 and 2 (MNXD1 / MNXD2)
    # current solution for the compartments: 1 -> c, 2 -> e
    comparts = ["c" if _ == "MNXD1" else "e" for _ in comparts]
    metabolites = {}
    meta_counter = 0

    # reconstruct reactants
    for mid, factor in reactants.items():
        tmp_meta = build_metabolite_mnx(
            mid, model, namespace, comparts[meta_counter], idprefix
        )
        if tmp_meta:
            metabolites[tmp_meta] = -1 * factor
            meta_counter += 1
        else:
            return None  # not able to build reaction successfully

    # reconstruct products
    for mid, factor in products.items():
        tmp_meta = build_metabolite_mnx(
            mid, model, namespace, comparts[meta_counter], idprefix
        )
        if tmp_meta:
            metabolites[tmp_meta] = factor
            meta_counter += 1
        else:
            return None  # not able to build reaction successfully

    # add metabolites to reaction
    new_reac.add_metabolites(metabolites)

    # set reversibility
    if rev:
        new_reac.bounds = cobra.Configuration().bounds
    else:
        new_reac.bounds = (0.0, cobra.Configuration().upper_bound)

    # get annotations
    # ---------------
    new_reac.annotation["sbo"] = "SBO:0000167"
    # get more annotation from the mnx_reac_xref table
    for db in [
        "bigg.reaction",
        "kegg.reaction",
        "seed.reaction",
        "metacyc.reaction",
        "rhea",
    ]:
        db_matches = mnx_reac_refs[mnx_reac_refs["source"].str.contains(db)]
        if len(db_matches) > 0:
            new_reac.annotation[db] = [
                m.split(":", 1)[1] for m in db_matches["source"].tolist()
            ]
            # update reactions direction, if MetaCyc has better information
            if db == "metacyc.reaction" and len(
                db_matches[db_matches["source"].str.contains("-->")]
            ):
                new_reac.bounds = (0.0, cobra.Configuration().upper_bound)
    # get alias annotations for MetaNetX if available
    if "alias" in references.keys():
        if references["alias"] != None:
            references["metanetx.reaction"] = list(references["alias"])
        del references["alias"]

    # add additional references from the parameter
    _add_annotations_from_dict_cobra(references, new_reac)

    # get annotations
    # ---------------
    new_reac.annotation["sbo"] = "SBO:0000167"

    # add notes
    # ---------
    new_reac.notes["created with"] = "refineGEMs based on MetaNetX"

    # match ID to namespace
    # ---------------------
    match_id_to_namespace(new_reac, namespace)
    # re-check, if reaction not in the model based on the namespace
    if new_reac.id in [_.id for _ in model.reactions]:
        return [new_reac.id]

    return new_reac




[docs]
def build_reaction_kegg(
    model: cobra.Model,
    id: str = None,
    reac_str: str = None,
    references: dict = {},
    idprefix: str = "refineGEMs",
    namespace: Literal["BiGG"] = "BiGG",
) -> Union[cobra.Reaction, None, list]:
    """Construct a new reaction for a model from either a KEGG reaction ID
    or a KEGG equation string.
    This function will NOT add the reaction directly to the model, if the
    construction process is successful.

    Args:
        - model (cobra.Model):
            The model loaded with COBRApy.
        - id (str,optional):
            A KEGG reaction ID.
        - reac_str (str, optional):
            The reaction equation string from the database.
            Defaults to None.
        - references (dict, optional):
            Additional annotations to add to the reaction (idtype:[value]).
            Defaults to {}.
        - idprefix (str, optional):
            Prefix for the pseudo-identifier. Defaults to 'refineGEMs'.
        - namespace (Literal['BiGG'], optional):
            Namespace to use for the reaction ID.
            If namespace cannot be matched, uses the pseudo-ID
            Defaults to 'BiGG'.

    Returns:
        1. Case successful construction:
                cobra.Reaction:
                    The newly build reaction object.

        2. Case construction not possible:
                None:
                    Nothing to return.

        3. Case reaction found in model.
                list:
                    List of matching reaction IDs (in model).
    """

    # either reaction id or a reaction string needed for reconstruction
    if not id and not reac_str:
        return None  # reconstruction not possible

    # create an empty reaction with random id
    new_reac = cobra.Reaction(create_random_id(model, "reac", idprefix))

    # -------------
    # KEGG ID given
    # -------------
    if id:
        # check, if reaction in model
        matches = [
            _.id
            for _ in model.reactions
            if "kegg.reaction" in _.annotation.keys()
            and _.annotation["kegg.reaction"] == id
        ]
        if len(matches) > 0:
            return matches  # return matched reaction ids in list

        # retrieve information from KEGG
        kegg_info = kegg_reaction_parser(id)
        if kegg_info:
            if "name" in kegg_info.keys():
                new_reac.name = kegg_info["name"]
            if "equation" in kegg_info.keys():
                reac_str = kegg_info["equation"] if not reac_str else reac_str
            if "db" in kegg_info.keys():
                new_reac.annotation = kegg_info["db"]

    # -------------------------------------
    # Reaction reconstruction from equation
    # -------------------------------------

    # skip, if no reaction string is available
    if not reac_str:
        return None  # reconstruction not possible

    # parse reaction string
    reactants, products, comparts, rev = parse_reac_str(reac_str, "KEGG")

    # KEGG has no information about compartments !!!
    # current solution for the compartments: always use c
    compartment = "c"
    metabolites = {}
    meta_counter = 0

    # reconstruct reactants
    for mid, factor in reactants.items():
        tmp_meta = build_metabolite_kegg(mid, model, namespace, compartment, idprefix)
        if tmp_meta:
            metabolites[tmp_meta] = -1 * factor
            meta_counter += 1
        else:
            return None  # not able to build reaction successfully

    # reconstruct products
    for mid, factor in products.items():
        tmp_meta = build_metabolite_kegg(mid, model, namespace, compartment, idprefix)
        if tmp_meta:
            metabolites[tmp_meta] = factor
            meta_counter += 1
        else:
            return None  # not able to build reaction successfully

    # add metabolites to reaction
    new_reac.add_metabolites(metabolites)

    # set reversibility
    if rev:
        new_reac.bounds = (1000.0, 1000.0)
    else:
        new_reac.bounds = (0.0, 1000.0)

    # --------------------
    # add more information
    # --------------------
    new_reac.annotation["sbo"] = "SBO:0000167"
    # get more information from searching the KEGG ID in BiGG
    bigg_res = load_a_table_from_database(
        f"SELECT * FROM bigg_reactions WHERE \"KEGG Reaction\" = '{id}'", query=True
    )
    is_exchange = 0  # check for exchange reactions
    for idx, row in bigg_res.iterrows():
        r, p, compartments, r = parse_reac_str(row["reaction_string"], "BiGG")
        valid_comp = validate_reaction_compartment_bigg(compartments)

        if valid_comp:
            new_reac.annotation["bigg.reaction"] = row["id"]
            _add_annotations_from_bigg_reac_row(row, new_reac)
            if valid_comp == "exchange":
                is_exchange += 1
    new_reac.name = f"EX_{new_reac.name}" if is_exchange > 0 else new_reac.name

    # add additional references from the parameter
    _add_annotations_from_dict_cobra(references, new_reac)

    # add notes
    # ---------
    new_reac.notes["created with"] = "refineGEMs based on KEGG"

    # match ID to namespace
    # ---------------------
    match_id_to_namespace(new_reac, namespace)
    # re-check, if reaction not in the model based on the namespace
    if new_reac.id in [_.id for _ in model.reactions]:
        return [new_reac.id]

    return new_reac




[docs]
def build_reaction_bigg(
    model: cobra.Model,
    id: str,
    reac_str: str = None,
    references: dict = {},
    idprefix: str = "refineGEMs",
    namespace: Literal["BiGG"] = "BiGG",
) -> Union[cobra.Reaction, None, list]:
    """Construct a new reaction for a model from a BiGG reaction ID.
    This function will NOT add the reaction directly to the model, if the
    construction process is successful.

    Args:
        - model (cobra.Model):
            The model loaded with COBRApy.
        - id (str):
            A BiGG reaction ID.
        - reac_str (str, optional):
            The reaction equation string from the database.
            Currently, this param is not doing anything in this function.
            Defaults to None.
        - references (dict, optional):
            Additional annotations to add to the reaction (idtype:[value]).
            Defaults to {}.
        - idprefix (str, optional):
            Prefix for the pseudo-identifier. Defaults to 'refineGEMs'.
        - namespace (Literal['BiGG'], optional):
            Namespace to use for the reaction ID.
            If namespace cannot be matched, uses the pseudo-ID
            Defaults to 'BiGG'.

    Returns:
        1. Case successful construction:
                cobra.Reaction:
                    The newly build reaction object.

        2. Case construction not possible:
                None:
                    Nothing to return.

        3. Case reaction found in model.
                list:
                    List of matching reaction IDs (in model).
    """

    # ---------------------
    # check, if ID in model
    # ---------------------
    matches_found = [
        _.id
        for _ in model.reactions
        if "bigg.reaction" in _.annotation.keys()
        and _.annotation["bigg.reaction"] == id
    ]
    if len(matches_found) > 0:
        return matches_found

    # -----------------------------
    # otherwise, build new reaction
    # -----------------------------
    # create reaction object
    new_reac = cobra.Reaction(create_random_id(model, "reac", idprefix))

    # get information from the database
    bigg_reac_info = load_a_table_from_database(
        f"SELECT * FROM bigg_reactions WHERE id = '{id}'", query=True
    ).iloc[0, :]
    new_reac.name = bigg_reac_info["name"]

    # get information from the reaction string
    reactants, products, comparts, rev = parse_reac_str(
        bigg_reac_info["reaction_string"], "BiGG"
    )

    # validate & add compartment information
    # --------------------------------------
    valid_comp = validate_reaction_compartment_bigg(comparts)
    if valid_comp:
        if valid_comp == "exchange":
            new_reac.name = f"EX_{new_reac.name}"
    else:
        return None

    # add metabolites
    # ---------------
    metabolites = {}
    meta_counter = 0
    # reconstruct reactants
    for mid, factor in reactants.items():
        tmp_meta = build_metabolite_bigg(mid, model, namespace, idprefix)
        if tmp_meta:
            metabolites[tmp_meta] = -1 * factor
            meta_counter += 1
        else:
            return None  # not able to build reaction successfully

    # reconstruct products
    for mid, factor in products.items():
        tmp_meta = build_metabolite_bigg(mid, model, namespace, idprefix)
        if tmp_meta:
            metabolites[tmp_meta] = factor
            meta_counter += 1
        else:
            return None  # not able to build reaction successfully

    # add metabolites to reaction
    new_reac.add_metabolites(metabolites)

    # set reversibility
    if rev:
        new_reac.bounds = (1000.0, 1000.0)
    else:
        new_reac.bounds = (0.0, 1000.0)

    # add annotations
    # ---------------
    # add SBOTerm
    new_reac.annotation["sbo"] = "SBO:0000167"
    # add infos from BiGG
    new_reac.annotation["bigg.reaction"] = [id]
    _add_annotations_from_bigg_reac_row(bigg_reac_info, new_reac)
    # get alias annotations for MetaNetX if available
    if "alias" in references.keys():
        if references["alias"] != None:
            references["metanetx.reaction"] = list(references["alias"])
        del references["alias"]
    # add additional references from the parameter
    _add_annotations_from_dict_cobra(references, new_reac)

    # add notes
    # ---------
    new_reac.notes["created with"] = "refineGEMs based on BiGG"

    # match ID to namespace
    # ---------------------
    match_id_to_namespace(new_reac, namespace)
    # re-check, if reaction not in the model based on the namespace
    if new_reac.id in [_.id for _ in model.reactions]:
        return [new_reac.id]

    return new_reac




[docs]
@implement
# maybe for later, if we need something to work independantly from namespace
# and outside the gapfilling
# Or would it be better to have a function add_reaction() that only adds
# reactions built with the previuous functions?
def build_reaction():
    pass



# remove genes from model
# -----------------------



[docs]
def remove_non_essential_genes(
    model: cobra.Model,
    genes_to_check: Union[list[cobra.Gene], None] = None,
    remove_reactions: bool = True,
    min_growth_threshold: float = MIN_GROWTH_THRESHOLD,
) -> tuple[int, int]:
    """Remove non-essential genes based on gene knock-out.

    Args:
        - model (cobra.Model):
            The model to delete the genes from.
        - genes_to_check (Union[list[cobra.Gene],None], optional):
            List of genes to check. If None, all genes of the model are checked.
            Defaults to None.
        - remove_reactions (bool, optional):
            If set to True, also remove corresponding reactions.
            Defaults to True.
        - min_growth_threshold (float, optional):
            Minimal value for growth value.
            Defaults to MIN_GROWTH_THRESHOLD.

    Returns:
        tuple[int,int]:
            Tuple of (1) int, (2) int:
                (1) Number of deleted genes.
                (2) Number of essential genes.
    """

    # initialise values
    to_delete = 0
    essential_counter = 0
    remove = False

    # if no list of genes is given, take all genes of model
    if not genes_to_check:
        genes_to_check = model.genes

    # iterate over all genes of interest
    for g in genes_to_check:
        # test knock out on a model copy
        with model as m:
            # knock out current gene
            g.knock_out()
            res = model.optimize()
            # make sure, model still reaches minimal growth
            if res.objective_value > min_growth_threshold:
                # set for deletion
                remove = True
                to_delete += 1
            else:
                # keep
                essential_counter += 1

        # delete gene
        if remove:
            cobra.manipulation.delete.remove_genes(
                model, [g], remove_reactions=remove_reactions
            )
            remove = False

    return (to_delete, essential_counter)



# ++++++++++++++++++++++++++++++++++++++++
# libSBML - models
# ++++++++++++++++++++++++++++++++++++++++

# extracting reactions & Co via libsbml
# -------------------------------------


# Function originally from refineGEMs.genecomp/refineGEMs.KEGG_analysis --- Modified
# Might be possible to deprecate

[docs]
def compare_gene_lists(
    gps_in_model: pd.DataFrame, db_genes: pd.DataFrame, kegg: bool = True
) -> pd.DataFrame:
    """Compares the provided tables according to column 0/'Locus_tag'

    Args:
        - gps_in_model (pd.DataFrame):
            Table containing the KEGG Gene IDs/Locus tags in the model
        - db_genes (pd.DataFrame):
            Table containing the KEGG Gene IDs for the organism from KEGG/
            locus tags (Accession-2) from BioCyc
        - kegg (bool):
            True if KEGG Genes should be extracted, otherwise False

    Returns:
        pd.DataFrame:
            Table containing all missing genes
    """
    in_db = db_genes.set_index(0) if kegg else db_genes.set_index("locus_tag")
    in_model = gps_in_model.set_index(0)

    genes_in_db_not_in_model = in_db[~in_db.index.isin(in_model.index)]

    return (
        genes_in_db_not_in_model.reset_index().iloc[:, 0]
        if kegg
        else genes_in_db_not_in_model.reset_index()
    )



# Function originally from refineGEMs.genecomp/refineGEMs.KEGG_analysis --- Modified

[docs]
def get_model_reacs_or_metabs(
    model_libsbml: libModel, metabolites: bool = False, col_name: str = "bigg_id"
) -> pd.DataFrame:
    """Extracts table of reactions/metabolites with BiGG IDs from model

    Args:
        - model_libsbml (libModel):
            Model loaded with libSBML
        - metabolites (bool):
            Set to True if metabolites from model should be extracted
        - col_name (str):
            Name to be used for column in table. Defaults to 'bigg_id'.

    Returns:
        pd.DataFrame:
            Table with model identifiers for either metabolites or reactions
    """
    reac_or_metab_list = (
        model_libsbml.getListOfSpecies()
        if metabolites
        else model_libsbml.getListOfReactions()
    )

    list_of_reacs_or_metabs = []
    for reac_or_metab in reac_or_metab_list:
        list_of_reacs_or_metabs.append(reac_or_metab.id[2:])

    reac_or_metab_list_df = pd.Series(list_of_reacs_or_metabs)
    reac_or_metab_list_df = pd.DataFrame(reac_or_metab_list_df, columns=[col_name])

    return reac_or_metab_list_df




[docs]
def get_reversible(fluxes: dict[str:str]) -> bool:
    """Infer if reaction is reversible from flux bounds

    Args:
        - fluxes (dict):
            Dictionary containing the keys 'lower_bound' & 'upper_bound'
            with values in ['cobra_default_lb', 'cobra_0_bound', 'cobra_default_ub']

    Returns:
        bool:
            True if reversible else False
    """
    return (fluxes["lower_bound"] == "cobra_default_lb") and (
        fluxes["upper_bound"] == "cobra_default_ub"
    )



# map model entities via libSBML
# ------------------------------



[docs]
def get_gpid_mapping(
    model: libModel,
    gff_paths: Union[str,list[str]] = None,
    email: str = None,
    contains_locus_tags: bool = False,
    outpath: Union[str, Path] = None,
) -> pd.DataFrame:
    """Generate a mapping from model IDs to valid database IDs via model content, GFF files (optional)
    and NCBI requests (optional).

    .. hint:

        Mappings may be incomplete and require manual adjustment.

    .. note:

        Mapping currently is only implemented for NCBI Protein and RefSeq IDs.
        Other databases may be added in the future.

    Args:
        - model (libModel):
            Model loaded with libSBML
        - gff_paths (str|list[str]):
            Path(s) to GFF file(s). Allowed GFF formats are: RefSeq, NCBI and Prokka.
            This is only used when mapping_tbl_file == None.
            Defaults to None.
        - email (str):
            E-mail for NCBI queries.
            This is only used when mapping_tbl_file == None.
            Defaults to None.
        - contains_locus_tags (bool, optional):
            Specifies if provided model has locus tags within the label tag if set to True.
            This is only used when mapping_tbl_file == None.
            Defaults to False.
        - outpath (str|Path, optional):
            Output path for location where the generated mapping table should be written to.
            This is only used when mapping_tbl_file == None.
            Defaults to None.

    Returns:
        pd.DataFrame:
            Mapping from model IDs to valid database IDs
    """
    # Set-up path
    if outpath:
        if isinstance(outpath, str):
            outpath = Path(outpath)
        outpath.mkdir(parents=True, exist_ok=True)

    # Helper function to classify potential database IDs according to db specific regexes
    def _classify_potential_db_ids(row: pd.Series) -> pd.Series:
        """Calssifies potential database IDs according to db specific regexes

        Args:
            - row (pd.Series): 
                Row of a DataFrame containing the potential database ID

        Returns:
            pd.Series: 
                Row of the DataFrame with classified database IDs
        """
        # Get NCBI ID
        ncbi_id = row["database_id"] if not pd.isnull(row["database_id"]) else ''

        # Get locus_tag
        locus_tag = row.get("locus_tag", '') if not pd.isnull(row.get("locus_tag")) else ''

        # Check that potential database ID is not the locus_tag
        if (not re.fullmatch(locus_tag, ncbi_id, re.IGNORECASE)) or ('model_id' in row.index.to_list()):
        
            # If identifier matches RefSeq ID pattern
            if re.fullmatch(rf'{DB2REGEX["refseq"]}', ncbi_id, re.IGNORECASE):
                row["REFSEQ"] = row["database_id"]

            # If identifier matches ncbiprotein ID pattern
            elif re.fullmatch(rf'{DB2REGEX["ncbiprotein"]}$', ncbi_id, re.IGNORECASE):
                row["NCBI"] = row["database_id"]

            # No pattern match & no locus_tag match
            else:
                row["UNCLASSIFIED"] = row["database_id"]

        return row

    # 1. Get model IDs & potential contained IDs
    print("Extracting model IDs and potential valid database IDs from model...")
    gene_list = model.getPlugin("fbc").getListOfGeneProducts()
    modelid2potentialid = {"model_id": [], "database_id": []}

    for gene in tqdm(gene_list):

        # Get locus_tag if available
        if contains_locus_tags:
            modelid2potentialid["locus_tag"] = (
                gene.getLabel() if gene.isSetLabel() else None
            )

        # Get current gene id
        gene_id = gene.getId()
        modelid2potentialid["model_id"].append(gene_id)

        # Get gene_id without prefix & replace ascii numbers with according characters
        gene_id = _f_gene(gene_id)

        # Extract potential database ID from gene_id
        if (gene_id.lower() != "spontaneous") and (
            gene_id.lower() != "unknown"
        ):  # Has to be omitted as no additional data can be retrieved neither from NCBI nor the CarveMe input file
            # Case 1: CarveMe version; Contains NCBI Protein ID, lcl_CP035291_1_prot_QCY37541_1_361
            if "prot_" in gene_id:
                 # All NCBI CDS protein FASTA files have the NCBI protein identifier + Version after 'prot_' in the FASTA identifier
                id_string = gene_id.split(r"prot_")[1].split(r"_")
                # If identifier contains no '_', this is full identifier + add version number
                ncbi_id = '.'.join(id_string[0:2])

            # Case 2: RAST contained without any surrounding stuff, 1134914.3.1020_56.peg
            # Case 3: RefSeq contained without any surrounding stuff, WP_011275285.1, From CarveMe: YP_005228568_1
            # Case 4: KEGG contained without any surrounding stuff, sha:SH1836
            # Case 5: Model ID contains locus_tag, AB-1_S128_00983
            else:
                ncbi_id = gene_id

        # Add potential database ID to dictionary
        modelid2potentialid["database_id"].append(ncbi_id)

    # Generate table from dictionary
    mapping_table = pd.DataFrame.from_dict(modelid2potentialid)

    # Add empty column for REFSEQ, NCBI and UNCLASSIFIED IDs
    mapping_table["REFSEQ"] = None
    mapping_table["NCBI"] = None
    mapping_table["UNCLASSIFIED"] = None

    # Classify potential database IDs according to db specific regexes
    print('Classifying potential database IDs according to db specific regexes...')
    mapping_table = mapping_table.progress_apply(_classify_potential_db_ids, axis=1)
    mapping_table.drop("database_id", axis=1, inplace=True)

    # Drop all columns containing only NaNs
    mapping_table = mapping_table.dropna(axis=1, how="all")

    # 1.1 (Optional) Get information from GFF(s)
    if gff_paths:
        print("Extracting (protein id,) locus tag and name from GFF(s)...")

        # Identical attributes to keep per GFF
        to_keep = {"locus_tag": "locus_tag", "product": "name"}

        # Parse & collect all provided GFFs
        gffs = []
        if isinstance(gff_paths, str): gff_paths = [gff_paths]
        for gffp in tqdm(gff_paths):
            current_gff = parse_gff_for_cds(gffp)

            # Check if protein_id in columns
            if "protein_id" in current_gff.columns: to_keep.update({"protein_id": "database_id"})

            # Subset dataframe to only keep relevant columns & rename them
            current_gff = current_gff[to_keep.keys()]
            current_gff = current_gff.rename(columns=to_keep)

            # Explode columns containing lists to get single entries per row
            current_gff = current_gff.explode(column=current_gff.columns.to_list(),ignore_index=True)
            current_gff = current_gff.drop_duplicates()
            current_gff = current_gff.reset_index(drop=True)

            # If protein_id/database_id is available, map it to according database
            if "database_id" in current_gff.columns:

                # Add empty column for REFSEQ, NCBI and UNCLASSIFIED IDs
                current_gff["REFSEQ"] = None
                current_gff["NCBI"] = None
                current_gff["UNCLASSIFIED"] = None

                # Classify potential database IDs according to db specific regexes
                print('Classifying potential database IDs according to db specific regexes...')
                current_gff = current_gff.progress_apply(_classify_potential_db_ids, axis=1)
                current_gff.drop("database_id", axis=1, inplace=True)
                
                # Drop all columns containing only NaNs
                current_gff = current_gff.dropna(axis=1, how="all")

                # Rename columns to avoid duplicates
                if not ('NCBI' in current_gff.columns and 'REFSEQ' in current_gff.columns):
                    if 'NCBI' in current_gff.columns: current_gff.rename(columns={"name": "name_ncbi"}, inplace=True)
                    if 'REFSEQ' in current_gff.columns: current_gff.rename(columns={"name": "name_refseq"}, inplace=True)

            # Add current_gff to GFF list
            gffs.append(current_gff)

        # Merge all GFFs on common column locus_tag
        if len(gffs) == 1:
            gff_mapping = gffs[0]
        else:
            gff_mapping = reduce(
                lambda left, right: pd.merge(
                    left, right, on=["locus_tag"], how="outer", sort=True
                ),
                gffs,
            )

        # Find identical columns in mapping_table & gff_mapping
        identical_columns = list(
            set(mapping_table.columns).intersection(set(gff_mapping.columns))
        )

        # Merge both dataframes
        # If merge on identical columns is possible
        if identical_columns:
            mapping_table = pd.merge(
                mapping_table, gff_mapping, how="outer", on=identical_columns
            )
        else:  
            # Try merge on locus_tag & UNCLASSIFIED as UNCLASSIFIED could contain locus tags
            try:
                mapping_table = pd.merge(
                    mapping_table,
                    gff_mapping,
                    how="outer",
                    left_on="UNCLASSIFIED",
                    right_on="locus_tag",
                )
            except KeyError:
                filename = (
                    f'{model.getId()}_gp_id_gff_mapping_{str(date.today().strftime("%Y%m%d"))}.csv'
                    )
                if outpath:
                    filename = Path(outpath, filename)
                else:
                    filename = Path(filename)
                print(f'''
No common columns found between mapping table derived from model and mapping table derived from provided GFFs. Cannot merge.
The resulting mapping tables will be returned separately. The table for the GFF mapping is written to {filename}.
                ''')
                gff_mapping.to_csv(filename, index=False)

    # 2. Get information from NCBI
    contains_protein_ids = len({'NCBI', 'REFSEQ'}.intersection(set(mapping_table.columns))) > 0
    if email and contains_protein_ids:
        Entrez.email = email
        print("Retrieve locus tag and name information from NCBI...")

        # Add empty columns for names and locus tags if not already contained
        if not "locus_tag" in mapping_table.columns:
            mapping_table["locus_tag"] = None

        contains_only_name = ('name' in mapping_table.columns) and not ('name_refseq' in mapping_table.columns) and not ('name_ncbi' in mapping_table.columns)
        if not any(["name" in _ for _ in mapping_table.columns]) or contains_only_name:
            if ('NCBI' in current_gff.columns and 'REFSEQ' in current_gff.columns):
                mapping_table["name_ncbi"] = None
                mapping_table["name_refseq"] = None
            elif 'NCBI' in current_gff.columns:
                mapping_table["name_ncbi"] = None
            elif 'REFSEQ' in current_gff.columns:
                mapping_table["name_refseq"] = None

        # Query NCBI based on RefSeq Protein ID
        print('Querying NCBI based on RefSeq Protein IDs...')
        if "REFSEQ" in mapping_table.columns:
            mapping_table = mapping_table.progress_apply(
                _search_ncbi_for_gp, axis=1, args=("refseq",)
            )

        # Query NCBI based on NCBI Protein ID
        print('Querying NCBI based on NCBI Protein IDs...')
        if "NCBI" in mapping_table.columns:
            mapping_table = mapping_table.progress_apply(
                _search_ncbi_for_gp, axis=1, args=("ncbiprotein",)
            )

    # Clean-up dataframe for output
    def _merge_name_cols(row: pd.Series) -> pd.Series:
        """Merge all name columns into one column name

        Args:
            - row (pd.Series):
                Row of a DataFrame

        Returns:
            pd.Series:
                DataFrame with only one name column
        """
        # Prefer NCBI Protein name over all other names as most specific for organism
        if 'name_ncbi' in row.index.to_list():
            if row['name_ncbi']:
                row['name'] = row['name_ncbi'] if row['name_ncbi'] != row['NCBI'] else None
            # If no 'valid' NCBI Protein name was found, use REFSEQ name
            elif 'name_refseq' in row.index.to_list():
                if row['name_refseq']:
                    row['name'] = row['name_refseq'] if row['name_refseq'] != row['REFSEQ'] else None
        # If no NCBI Protein name column exists, use REFSEQ name
        elif 'name_refseq' in row.index.to_list():
            if row['name_refseq']:
                row['name'] = row['name_refseq'] if row['name_refseq'] != row['REFSEQ'] else None

        # If no REFSEQ and no NCBI Protein name column exists, check name column for locus_tag
        if row['name'] and (row['name'] == row['locus_tag']):
            row['name'] = None

        # Default name is whatever is stored in name column, if exists, or None
        return row

    print('Cleaning up dataframe for output...')
    # Generate default column name to set content
    if not 'name' in mapping_table.columns:
        mapping_table["name"] = None
    mapping_table = mapping_table.progress_apply(_merge_name_cols, axis=1)
    # Remove unnecessary name columns, if they exist
    names_to_remove = [n_col for n_col in mapping_table.columns if n_col.startswith("name_")]
    if names_to_remove: mapping_table.drop(names_to_remove, axis=1, inplace=True)

    # Write mapping information to file
    filename = (
        f'{model.getId()}_gp_id_mapping_{str(date.today().strftime("%Y%m%d"))}.csv'
    )
    if outpath:
        filename = Path(outpath, filename)
    else:
        filename = Path(filename)
    logging.info(
        f"The mapping table mapping the geneProduct IDs of the provided model {model.getId()} to the names and locus "
        + f"tags is saved to {filename}"
    )
    mapping_table.to_csv(filename, index=False)

    return mapping_table



# create model entities using libSBML
# -----------------------------------


[docs]
def create_gp(
    model: libModel,
    protein_id: str,
    model_id: str = None,
    name: str = None,
    locus_tag: str = None,
    reference: dict[str : tuple[Union[list, str], bool]] = dict(),
    sanity_check: bool = False
) -> None:
    """Creates GeneProduct in the given libSBML model.

    Args:
        - model (libModel):
            The model object, loaded with libSBML.
        - protein_id (str):
            (NCBI) Protein ID of the gene.
        - model_id (str, optional):
            If given, uses this string as the ID of the gene in the model.
            ID should be identical to ID that CarveMe adds from the NCBI FASTA input file.
            Defaults to None.
        - name (str, optional):
            Name of the GeneProduct. Defaults to None.
        - locus_tag (str, optional):
            Genome-specific locus tag. Will be used as label in the model.
            Defaults to None.
        - reference (dict, optional):
            Dictionary containing references for the gene product.
            The key is the database name, the value is a tuple with the first element being the ID(s)
            and the second element being a boolean indicating if the strain is a lab strain or not.
            Defaults to an empty dictionary.
        - sanity_check (bool, optional):
            Check, whether locus tag (label) or model ID (ID) already exist in model.
            Note, that setting this to True increases the runtime.
            Defaults to False.
    """

    # check, if possible ID has been passed
    if model_id:  # ID
       pass
    elif protein_id:
        model_id = _f_gene_rev(protein_id)
    elif locus_tag:
        model_id = _f_gene_rev(locus_tag)
    else:
        raise ValueError(
            "No valid ID found for gene product. Specify at least the locus tag."
        )
    
    # construct label, if possible
    label = None
    if locus_tag:
        label = _f_gene_rev(locus_tag, "")
    
    # sanity check
    if sanity_check:
        genes = model.getPlugin(0).getListOfGeneProducts()
        for g in genes:
            # for ID 
            if g.isSetId() and g.getId == model_id:
                logging.warning(f'Cannot add gene product, as ID {model_id} is alreaedy in the model.')
                return None
            # for locus tag as label
            if label and g.isSetLabel() and g.getLabel() == label:
                logging.warning(f'Cannot add gene product, as label {label} is already in the model.')
                return None
        
    # create gene product object
    gp = model.getPlugin(0).createGeneProduct()
    # set basic attributes
    gp.setIdAttribute(model_id)
    if name:
        gp.setName(name)  # Name
    if label:
        gp.setLabel(label)  # Label
    gp.setSBOTerm("SBO:0000243")  # SBOterm
    gp.setMetaId(f"meta_{_f_gene_rev(protein_id)}")  # Meta ID
    # test for NCBI/RefSeq
    id_db = None
    if re.fullmatch(
        r"^(((AC|AP|NC|NG|NM|NP|NR|NT|NW|WP|XM|XP|XR|YP|ZP)_\d+)|(NZ_[A-Z]{2,4}\d+))(\.\d+)?$",
        protein_id,
        re.IGNORECASE,
    ):
        id_db = "REFSEQ"
    elif re.fullmatch(r"^(\w+\d+(\.\d+)?)|(NP_\d+)$", protein_id, re.IGNORECASE):
        id_db = "NCBI"
    if id_db:
        add_cv_term_genes(protein_id, id_db, gp)  # NCBI protein

    # add further references
    # references {'dbname':(list_or_str_of_id,lab_strain_bool)}
    if reference and len(reference) > 0:
        for dbname, specs in reference.items():
            match specs[0]:
                case list():
                    for s in specs[0]:
                        add_cv_term_genes(s, REF_COL_GF_GENE_MAP[dbname], gp, specs[1])
                case str():
                    add_cv_term_genes(
                        specs[0], REF_COL_GF_GENE_MAP[dbname], gp, specs[1]
                    )
                case None:
                    pass
                case _:
                    raise ValueError(f"Unexpected type for reference value: {specs[0]}")




[docs]
def create_species(
    model: libModel,
    metabolite_id: str,
    name: str,
    compartment_id: str,
    charge: int,
    chem_formula: str,
) -> tuple[Species, libModel]:
    """Creates Species/Metabolite in the given model

    Args:
        - model (libModel):
            Model loaded with libSBML
        - metabolite_id (str):
            Metabolite ID within model (If model from CarveMe, preferable a BiGG ID)
        - name (str):
            Name of the metabolite
        - compartment_id (str):
            ID of the compartment where metabolite resides
        - charge (int):
            Charge for the metabolite
        - chem_formula (str):
            Chemical formula for the metabolite

    Returns:
        tuple:
            libSBML Species (1) & libSBML model (2)

            (1) Species: Created species/metabolite
            (2) libModel: Model containing the created metabolite
    """
    metabolite = model.createSpecies()
    metabolite.setId(f"M_{metabolite_id}")
    if name:
        metabolite.setName(name)
    metabolite.setMetaId(f"meta_M_{metabolite_id}")
    metabolite.setSBOTerm("SBO:0000247")
    metabolite.setInitialAmount(float("NaN"))
    metabolite.setHasOnlySubstanceUnits(True)
    metabolite.setBoundaryCondition(False)
    metabolite.setConstant(False)
    metabolite.setCompartment(compartment_id)
    metabolite.getPlugin(0).setCharge(charge)
    metabolite.getPlugin(0).setChemicalFormula(chem_formula)
    add_cv_term_metabolites(metabolite_id[:-2], "BIGG", metabolite)
    return metabolite, model




[docs]
def create_reaction(
    model: libModel,
    reaction_id: str,
    name: str,
    reactants: dict[str:int],
    products: dict[str:int],
    fluxes: dict[str:str],
    reversible: bool = None,
    fast: bool = None,
    compartment: str = None,
    sbo: str = None,
    genes: Union[str, list[str]] = None,
) -> tuple[Reaction, libModel]:
    """Creates new reaction in the given model

    Args:
        - model (libModel):
            Model loaded with libSBML
        - reaction_id (str):
            BiGG ID of the reaction to create
        - name (str):
            Human readable name of the reaction
        - reactants (dict):
            Metabolites as keys and their stoichiometry as values
        - products (dict):
            Metabolites as keys and their stoichiometry as values
        - fluxes (dict):
            Dictionary with lower_bound and upper_bound as keys
        - reversible (bool):
            True/False for the reaction
        - fast (bool):
            True/False for the reaction
        - compartment (str):
            BiGG compartment ID of the reaction (if available)
        - sbo (str):
            SBO term of the reaction
        - genes (str|list):
            List of genes belonging to reaction

    Returns:
        tuple:
            libSBML reaction (1) & libSBML model (2)

            (1) Reaction: Created reaction
            (2) libModel: Model containing the created reaction
    """
    reaction = model.createReaction()
    reaction.setId("R_" + reaction_id)
    if name:
        reaction.setName(name)
    reaction.setMetaId("meta_R_" + reaction_id)
    sbo = (
        sbo if sbo else "SBO:0000167"
    )  # SBO term for biochemical or transport reaction
    reaction.setSBOTerm(sbo)
    fast = fast if fast else False
    reaction.setFast(fast)
    if compartment:
        reaction.setCompartment(
            compartment
        )  # Set compartment for reaction if available
    reversible = reversible if reversible else get_reversible(fluxes)
    reaction.setReversible(reversible)
    if genes:
        if genes == "G_spontaneous":
            reaction.getPlugin(
                0
            ).createGeneProductAssociation().createGeneProductRef().setGeneProduct(gene)
        elif len(genes) == 1:
            reaction.getPlugin(
                0
            ).createGeneProductAssociation().createGeneProductRef().setGeneProduct(
                genes[0]
            )
        else:
            gp_ass_or = reaction.getPlugin(0).createGeneProductAssociation().createOr()
            for gene in genes:
                # Set GeneProductReferences if available
                gp_ass_or.createGeneProductRef().setGeneProduct(gene)
    for metab, stoich in reactants.items():  # reactants as dict with metab:stoich
        reaction.addReactant(model.getSpecies("M_" + metab), stoich)
    for metab, stoich in products.items():  # reactants as dict with metab:stoich
        reaction.addProduct(model.getSpecies("M_" + metab), stoich)
    reaction.getPlugin(0).setLowerFluxBound(fluxes["lower_bound"])
    reaction.getPlugin(0).setUpperFluxBound(fluxes["upper_bound"])
    add_cv_term_reactions(reaction_id, "BIGG", reaction)
    return reaction, model




[docs]
def create_gpr(reaction: Reaction, gene: Union[str, list[str]]) -> None:
    """For a given libSBML Reaction and a gene ID or a list of gene IDs,
    create a gene production rule inside the reaction.

    Currently only supports 'OR' causality.

    Args:
        - reaction (libsbml.Reaction):
            The reaction object to add the GPR to.
        - gene (str | list[str]):
            Either a gene ID or a list of gene IDs, that will be added to the GPR
            (OR causality).
    """

    # Step 1: test, if there is already a gpr
    # ---------------------------------------
    old_association_str = None
    old_association_fbc = None
    if reaction.getPlugin(0).getGeneProductAssociation():
        old_association = (
            reaction.getPlugin(0).getGeneProductAssociation().getListOfAllElements()
        )
        # case 1: only a single association
        if len(old_association) == 1 and isinstance(old_association[0], GeneProductRef):
            old_association_str = old_association[0].getGeneProduct()
        # case 2: nested structure of asociations
        elif isinstance(old_association[0], FbcOr) or isinstance(
            old_association[0], FbcAnd
        ):
            old_association_fbc = old_association[0].clone()
            # this should get the highest level association (that includes all others)

    # Step 2: create new gene product association
    # -------------------------------------------
    if old_association_str and isinstance(gene, str):
        gene = [old_association_str, gene]
    elif old_association_str and isinstance(gene, list):
        gene.append(old_association_str)

    # add the old association rule as an 'OR' (if needed)
    if not old_association_fbc:
        new_association = reaction.getPlugin(0).createGeneProductAssociation()
    else:
        new_association = (
            reaction.getPlugin(0).createGeneProductAssociation().createOr()
        )
        new_association.addAssociation(old_association_fbc)

    # add the remaining genes
    if isinstance(gene, str):
        new_association.createGeneProductRef().setGeneProduct(gene)
    elif isinstance(gene, list) and len(gene) == 1:
        new_association.createGeneProductRef().setGeneProduct(gene[0])
    elif isinstance(gene, list) and len(gene) > 1:
        gpa_or = new_association.createOr()
        for i in gene:
            gpa_or.createGeneProductRef().setGeneProduct(i)




[docs]
def create_unit(
    model_specs: tuple[int],
    meta_id: str,
    kind: str,
    e: int,
    m: int,
    s: int,
    uri_is: str = "",
    uri_idf: str = "",
) -> Unit:
    """Creates unit for SBML model according to arguments

    Args:
        - model_specs (tuple):
            Level & Version of SBML model
        - meta_id (str):
            Meta ID for unit (Neccessary for URI)
        - kind (str):
            Unit kind constant (see libSBML for available constants)
        - e (int):
            Exponent of unit
        - m (int):
            Multiplier of unit
        - s (int):
            Scale of unit
        - uri_is (str):
            URI supporting the specified unit
        - uri_idf (str):
            URI supporting the derived from unit

    Returns:
        Unit:
            libSBML unit object
    """
    unit = Unit(*model_specs)
    unit.setKind(kind)
    unit.setMetaId(f"meta_{meta_id}_{unit.getKind()}")
    unit.setExponent(e)
    unit.setMultiplier(m)
    unit.setScale(s)
    if uri_is:
        add_cv_term_units(uri_is, unit, BQM_IS)
    if uri_idf:
        add_cv_term_units(uri_idf, unit, BQM_IS_DERIVED_FROM)
    return unit




[docs]
def create_unit_definition(
    model_specs: tuple[int], identifier: str, name: str, units: list[Unit]
) -> UnitDefinition:
    """Creates unit definition for SBML model according to arguments

    Args:
        - model_specs (tuple):
            Level & Version of SBML model
        - identifier (str):
            Identifier for the defined unit
        - name (str):
            Full name of the defined unit
        - units (list):
            All units the defined unit consists of

    Returns:
        UnitDefinition:
            libSBML unit definition object
    """
    unit_definition = UnitDefinition(*model_specs)
    unit_definition.setId(identifier)
    unit_definition.setMetaId(f"meta_{identifier}")
    unit_definition.setName(name)

    # Iterate over all units provided for the unit definition & add the units
    for unit in units:
        unit_definition.addUnit(unit)

    return unit_definition



# create default fba units
# ^^^^^^^^^^^^^^^^^^^^^^^^



[docs]
def create_fba_units(model: libModel) -> list[UnitDefinition]:
    """Creates all fba units required for a constraint-based model

    Args:
        - model (libModel):
            Model loaded with libSBML

    Returns:
        list:
            List of libSBML UnitDefinitions
    """
    # Get model level & version for unit & unit definition
    model_specs = model.getLevel(), model.getVersion()

    # Create required units
    litre = create_unit(
        model_specs,
        "litre",
        UNIT_KIND_LITRE,
        e=1,
        m=1,
        s=-3,
        uri_is="0000104",
        uri_idf="0000099",
    )
    mole = create_unit(
        model_specs,
        "mole_0",
        UNIT_KIND_MOLE,
        e=1,
        m=1,
        s=-3,
        uri_is="0000040",
        uri_idf="0000013",
    )
    gram = create_unit(
        model_specs, "per_gram_0", UNIT_KIND_GRAM, e=-1, m=1, s=0, uri_is="0000021"
    )
    second = create_unit(
        model_specs,
        "second_0",
        UNIT_KIND_SECOND,
        e=1,
        m=3600,
        s=0,
        uri_is="0000032",
        uri_idf="0000010",
    )

    # Create unit definitions for hour & femto litre
    hour = create_unit_definition(
        model_specs, identifier="h", name="Hour", units=[second]
    )
    femto_litre = create_unit_definition(
        model_specs, identifier="fL", name="Femto litres", units=[litre]
    )

    # Create unit definitions for millimoles per gram dry weight (mmgdw) & mmgdw per hour
    mmgdw = create_unit_definition(
        model_specs,
        identifier="mmol_per_gDW",
        name="Millimoles per gram (dry weight)",
        units=[mole, gram],
    )

    # Create new units mole & gram to get new meta IDs
    mole = create_unit(
        model_specs,
        "mole_1",
        UNIT_KIND_MOLE,
        e=1,
        m=1,
        s=-3,
        uri_is="0000040",
        uri_idf="0000013",
    )
    gram = create_unit(
        model_specs, "per_gram_1", UNIT_KIND_GRAM, e=-1, m=1, s=0, uri_is="0000021"
    )
    # Create new unit second to fit to per hour & get new meta ID
    second = create_unit(
        model_specs,
        "second_1",
        UNIT_KIND_SECOND,
        e=-1,
        m=3600,
        s=0,
        uri_is="0000032",
        uri_idf="0000010",
    )

    mmgdwh = create_unit_definition(
        model_specs,
        identifier="mmol_per_gDW_per_h",
        name="Millimoles per gram (dry weight) per hour",
        units=[mole, gram, second],
    )
    add_cv_term_units("pubmed:7986045", mmgdwh, BQM_IS_DESCRIBED_BY)

    return [mmgdwh, mmgdw, hour, femto_litre]



# print model entities using libsbml
# ----------------------------------



[docs]
def print_UnitDefinitions(unit_defs: ListOfUnitDefinitions):
    """Prints a list of libSBML UnitDefinitions as XMLNodes

    Args:
        - unit_defs (ListOfUnitDefinitions):
            List of libSBML UnitDefinition objects
    """
    for unit_def in unit_defs:
        logging.info(unit_def.toXMLNode())