Source code for refinegems.classes.gapfill

#!/usr/bin/env python
"""Add reactions, genes and more to a model based on different gap-filling methods.
All (current) algorithms are separated into three steps: finding missing genes, finding missing reactions
and trying to add the found as missing entities to the model.

Available gap filling methods:

- | :py:class:`~refinegems.classes.gapfill.KEGGapFiller`
  | Mainly utilises information from the KEGG database. Needs a KEGG organism ID.
  | Estimated runtime: *to be determined*

- | :py:class:`~refinegems.classes.gapfill.BioCycGapFiller`
  | Mainly utilises information from the BioCyc database. Requires access to BioCyc SmartTables.
  | Estimated runtime: *to be determined*

- | :py:class:`~refinegems.classes.gapfill.GeneGapFiller`
  | Search for gaps using the GFF file and information from SwissProt.
  | Estimated runtime: *to be determined*

"""

__author__ = "Famke Baeuerle, Gwendolyn O. Döbel, Carolin Brune, Finn Mier and Dr. Reihaneh Mostolizadeh"

############################################################################
# requirements
############################################################################

from abc import ABC, abstractmethod

import cobra
from cobra.io.sbml import _f_gene, _f_gene_rev
from copy import deepcopy
import io
import json
import logging
import math
import numpy as np
import os
import pandas as pd
import re
import warnings

from bioservices.kegg import KEGG
from itertools import chain
from libsbml import Model as libModel
from pathlib import Path
from sortedcontainers import SortedSet
from tempfile import NamedTemporaryFile
from typing import Literal, Union

from tqdm import tqdm

tqdm.pandas()

from ..developement.decorators import *
from ..utility.db_access import (
    get_ec_from_ncbi,
    get_ec_via_swissprot,
    parse_KEGG_gene,
    parse_KEGG_ec,
    map_to_homologs,
)
from ..utility.cvterms import PREFIX2DB_GENES
from ..utility.io import (
    load_a_table_from_database,
    parse_gff_for_cds,
    load_model,
    write_model_to_file,
    convert_cobra_to_libsbml,
)
from ..utility.entities import (
    create_gp,
    create_gpr,
    build_reaction_bigg,
    build_reaction_kegg,
    build_reaction_mnx,
    isreaction_complete,
    parse_reac_str,
    validate_reaction_compartment_bigg,
)
from ..utility.databases import mnx_db_namespace
from ..utility.util import insert_into_dict

from .medium import Medium, medium_to_model
from .reports import GapFillerReport

# @Note:
#   some reactions have @DEBUGGING
#   enable these parts to shorten runtime during debugging (used to work on a subset,
#   not on the whole input)

################################################################################
# setup logging
################################################################################

logger = logging.getLogger(__name__)

############################################################################
# variables
############################################################################

DB_REFERENCE_COLS = {
    'ncbigene',
    'ncbiprotein_x',
    'kegg.genes',
    'uniprot',
    'BioCyc',
    'BiGG',
    'MetaNetX',
} #: :meta hide-value:

DBEQ2EQ = {
    "BiGG": "reaction_string", 
    "MetaNetX": "mnx_equation",
} #: :meta hide-value:

############################################################################
# functions
############################################################################

# Cleaning up references after mapping
# ------------------------------------

[docs]
def _clean_table_after_mapping(mapped_table: pd.DataFrame, entity_type: Literal["reaction", "gene"] = "reaction") -> pd.DataFrame:
    """Clean a table containing mapping results for different databases

    Args:
        - mapped_table (pd.DataFrame): 
            Table containinfg a mapping for different databases
        - entity_type (Literal['reaction', 'gene'], optional):
            Type of entity the mapping refers to.
            Defaults to 'reaction'.

    Returns:
        pd.DataFrame: 
            The cleaned table.
    """

    def _clean_ref_row(
        ref_row: pd.Series, ref_dbs: set, entity_type: Literal["reaction", "gene"]
    ) -> pd.Series:
        """Clean a row of a table containing mapping results for different databases

        Args:
            - ref_row (pd.Series):
                Row containing a mapping for different databases
            - ref_db (set):
                Set of column name(s) for reference databases
            - entity_type (Literal['reaction', 'gene'], optional):
                Type of entity the mapping refers to.

        Returns:
            pd.Series:
                The cleaned row.
        """
        references = {} # Initialise references dictionary
        # Iterate over possible database columns
        for db in ref_dbs:
            # Get list of mapped to database IDs
            id_list = ref_row[db].replace(' ', '').split(',') if isinstance(ref_row[db], str) else ref_row[db]
            if id_list:

                match entity_type:
                    case "gene":
                        db = db.removesuffix('_x') # Remove suffix if necessary
                        id_set = set(id_list)
                        # Divide into RefSeq & NCBI Protein IDs if in column ncbiprotein
                        if db == 'ncbiprotein':
                            split_id_set = {
                                PREFIX2DB_GENES['refseq']: set(),
                                PREFIX2DB_GENES[db]: set()
                                } # Initialise sets for RefSeq & NCBI Protein IDs
                            for i in id_set:
                                if re.fullmatch(
                                            r"^(((AC|AP|NC|NG|NM|NP|NR|NT|NW|WP|XM|XP|XR|YP|ZP)_\d+)|(NZ_[A-Z]{2,4}\d+))(\.\d+)?$",
                                            i,
                                            re.IGNORECASE,
                                        ):
                                    split_id_set[PREFIX2DB_GENES['refseq']].add(i)
                                elif re.fullmatch(r"^(\w+\d+(\.\d+)?)|(NP_\d+)$", i, re.IGNORECASE):
                                    split_id_set[PREFIX2DB_GENES[db]].add(i)
                                
                                references.update((k,v) for k,v in split_id_set.items() if len(v)) # Add IDs to references
                        else:
                            references[PREFIX2DB_GENES[db]] = set(id_list)
                    case "reaction":
                        # Move BioCyc IDs to references if BioCyc IDs are present
                        biocyc_reac_id = ref_row["id"]
                        ref_row["id"] = str(id_list[0]) # Ensure ID is a string

                        # Remove ID from column 'id' from list in column 'alias'
                        if len(id_list) != 1:
                            id_list.remove(ref_row["id"])
                            alias = id_list
                        else:
                            alias = None

                        # Set-up references
                        references["metacyc.reaction"] = biocyc_reac_id # Move BioCyc IDs and alias to references
                        references["alias"] = alias
                        ref_row["equation"] = ref_row[DBEQ2EQ.get(db)] # Move equation from other database to equation column
                        ref_row["via"] = db # Replace BioCyc in via column with mapped to database
                    case _:
                        raise ValueError(f"Unknown entity_type {entity_type} in subfunction _clean_ref_row from _clean_table_after_mapping.")

        # Add all reference columns to references
        ref_row["reference"] = references

        return ref_row

    # Get union between DB_REFERENCE_COLS and columns in table
    contained_cols = set(mapped_table.columns).intersection(DB_REFERENCE_COLS)

    # If no columns found, return table as is
    if not contained_cols: return mapped_table

    # Otherwise iterate over the columns
    cleaned_table = mapped_table.apply(_clean_ref_row, args=(contained_cols,entity_type), axis=1)
    cleaned_table.drop(columns=contained_cols, inplace=True) # Drop database columns

    return cleaned_table



# Mapping for BioCyc Reactions
# ----------------------------

[docs]
def map_biocyc_to_reac(
    biocyc_reacs: pd.DataFrame, use_MNX: bool = True, use_BiGG: bool = True
) -> pd.DataFrame:
    """Based on a table containing BioCyc reactions,
    map them to reactions in other databases (if a mapping is possible)

    Args:
        - biocyc_reacs (pd.DataFrame):
            Table containing BioCyc reactions information.
            Should contain the columns: Reaction | Object ID | EC-Number | Spontaneous?
            Can be doenloaded as a SmartTable from BioCyc.
        - use_MNX (bool, optional):
            Try mapping using the MetaNetX database.
            Defaults to True.
        - use_BiGG (bool, optional):
            Try mapping using the BiGG database.
            Defaults to True.

    Returns:
        pd.DataFrame:
            The extended table.
    """

    def _map_biocyc_to_mnx(unmapped_reacs: pd.DataFrame) -> pd.DataFrame:
        """
        | Helper function for :py:func:`~refinegems.classes.gapfill.map_biocyc_to_reac`
        | Maps Biocyc IDs in a table to the corresponding MetaNetX IDs &
        | Adds information obtained via the MetaNetX IDs to the resulting table

        Args:
            - unmapped_reacs (pd.DataFrame):
                Table containing BioCyc IDs

        Returns:
            pd.DataFrame:
                The extended table.
        """

        # Step 1: Mapping & Obtain information
        # ------------------------------------
        # Mapping to MetaNetX + addition of more information on MetaNetX
        # Reactions
        # Get MetaNetX BioCyc
        mnx2biocyc_reacs = load_a_table_from_database(
            """
            SELECT x.source, x.id, p.mnx_equation, p.\'ec-code\' 
            FROM mnx_reac_xref x INNER JOIN mnx_reac_prop p 
            USING(id) 
            WHERE x.source LIKE \'%metacyc%\' 
            OR x.source LIKE \'%biocyc%\'
            """
        )

        # Change column names to fit input table
        mnx2biocyc_reacs.rename(
            {"id": "MetaNetX", "source": "id"}, axis=1, inplace=True
        )

        # Transform id column to only contain BioCyc/MetaCyc IDs
        mnx2biocyc_reacs["id"] = mnx2biocyc_reacs["id"].str.split(":", n=1).str[-1]

        # Crop table to contain MetaNetX IDs set per BioCyc ID
        mnx_as_list = (
            mnx2biocyc_reacs.groupby("id")["MetaNetX"]
            .apply(SortedSet)
            .reset_index(name="MetaNetX")
        )
        mnx2biocyc_reacs.drop("MetaNetX", axis=1, inplace=True)
        mnx2biocyc_reacs = mnx_as_list.merge(mnx2biocyc_reacs, on="id")

        # Drop duplicates to get the unique BioCyc IDs
        mnx2biocyc_reacs.drop_duplicates(subset="id", inplace=True)

        # Merge mnx2biocyc_reacs with unmapped_reacs to get new information
        reacs_mapped = mnx2biocyc_reacs.merge(unmapped_reacs, on="id", how="right")

        # Step 2: Clean-up
        # ----------------
        # Remove all NaNs from DataFrame
        reacs_mapped.replace(np.nan, None, inplace=True)

        # Turn MetaNetX column in single value, rename column to id
        # & if multiple MetaNetX IDs exist add to column alias
        reacs_mapped = _clean_table_after_mapping(reacs_mapped, 'reaction')

        # Create list of EC codes in column ec-code_x,
        # Join both ec-code columns into one & Create a set of ec-codes
        reacs_mapped["ec-code_x"] = reacs_mapped["ec-code_x"].str.split(r"\s*;\s*")
        reacs_mapped["ec-code_x"] = (
            reacs_mapped["ec-code_x"]
            .fillna({i: [] for i in reacs_mapped.index})
            .map(set)
            .map(list)
        )
        reacs_mapped["ec-code"] = (
            (reacs_mapped["ec-code_x"] + reacs_mapped["ec-code_y"]).map(set).map(list)
        )

        # Drop all unnecessary columns
        reacs_mapped.drop(
            ["mnx_equation", "ec-code_x", "ec-code_y"], axis=1, inplace=True
        )

        # Step 3: Return result
        # ---------------------
        return reacs_mapped

    def _map_biocyc_to_bigg(unmapped_reacs: pd.DataFrame) -> pd.DataFrame:
        """
        | Helper function for :py:func:`~refinegems.classes.gapfill.map_biocyc_to_reac`
        | Maps Biocyc IDs in a table to the corresponding BiGG IDs &
        | Adds information obtained via the BiGG IDs to the resulting table

        Args:
            - unmapped_reacs (pd.DataFrame):
                Table containing BioCyc IDs

        Returns:
            pd.DataFrame:
                The extended table.
        """

        # Step 1: Mapping & Obtain information
        # ------------------------------------
        # Mapping to BiGG + addition of more information on BiGG Reactions
        # Get BiGG BioCyc
        bigg2biocyc_reacs = load_a_table_from_database(
            "SELECT id, BioCyc, name, reaction_string from bigg_reactions"
        )

        # Filter BiGG table to contain only reaction IDs with valid compartments
        mask = bigg2biocyc_reacs.apply(
            lambda x: validate_reaction_compartment_bigg(
                parse_reac_str(x["reaction_string"], "BiGG")[2]
            ),
            axis=1,
        )
        mask.replace("exchange", True, inplace=True)
        bigg2biocyc_reacs = bigg2biocyc_reacs[mask]

        # Change column names to fit input table
        bigg2biocyc_reacs.rename({"id": "BiGG", "BioCyc": "id"}, axis=1, inplace=True)

        # Crop table to contain BiGG IDs set per BioCyc ID
        bigg_as_list = (
            bigg2biocyc_reacs.groupby("id")["BiGG"].apply(SortedSet).reset_index(name="BiGG")
        )
        bigg2biocyc_reacs.drop("BiGG", axis=1, inplace=True)
        bigg2biocyc_reacs = bigg_as_list.merge(bigg2biocyc_reacs, on="id")

        # Drop duplicates to get the unique BioCyc IDs
        bigg2biocyc_reacs.drop_duplicates(subset="id", inplace=True)

        # Merge bigg2biocyc_reacs with unmapped_reacs to get new information
        reacs_mapped = bigg2biocyc_reacs.merge(unmapped_reacs, on="id", how="right")

        # Step 2: Clean-up
        # ----------------
        # Remove all NaNs from DataFrame
        reacs_mapped.replace(np.nan, None, inplace=True)

        # Turn BiGG column in single value, rename column to id
        # & if multiple BiGG IDs exist add to column alias
        reacs_mapped = _clean_table_after_mapping(reacs_mapped, 'reaction')

        # Drop all unnecessary columns
        reacs_mapped.drop(["reaction_string", "name"], axis=1, inplace=True)

        # Step 3: Return result
        # ---------------------
        return reacs_mapped

    # Step 1: Get missing reactions without GPR
    # -----------------------------------------
    mask = biocyc_reacs["add_to_GPR"].isna()
    biocyc_reacs_to_map = biocyc_reacs[mask]

    # Step 2: Mapping
    # ---------------
    # @DISCUSSION Rewrite to use _clean_table_after_mapping only once? Add BiGG & MetaNetX columns if both True?
    # Map to MetaNetX
    if use_MNX:
        biocyc_reacs_to_map = _map_biocyc_to_mnx(biocyc_reacs_to_map)

    # Map to BiGG
    if use_BiGG:
        to_map = biocyc_reacs_to_map[biocyc_reacs_to_map["via"] == "BioCyc"]
        if len(to_map) > 0:
            to_map = _map_biocyc_to_bigg(to_map)
            biocyc_reacs_to_map = pd.concat(
                [to_map, biocyc_reacs_to_map[~(biocyc_reacs_to_map["via"] == "BioCyc")]]
            )

    # Step 3: Gather result(s)
    # ------------------------
    # Merge all tables if necessary
    biocyc_reacs = pd.concat(
        [biocyc_reacs_to_map, biocyc_reacs[~biocyc_reacs["add_to_GPR"].isna()]],
        sort=True,
        ignore_index=True,
    )

    # Return results
    return biocyc_reacs



# Mapping of EC numbers
# ---------------------



[docs]
def map_ec_to_reac(
    table: pd.DataFrame,
    use_MNX: bool = True,
    use_BiGG: bool = True,
    use_KEGG: bool = True,
    threshold_add_reacs: int = 5,
) -> pd.DataFrame:
    """Based on a table of NCBI protein IDs and EC numbers,
    map them to reactions via different databases
    (if a mapping is possible).

    input table should have format:
        ``ec-code | ncbiprotein``

    output table has the format:
        ``ec-code | ncbiprotein | id | equation | reference | is_transport | via``

    Args:
        - table (pd.DataFrame):
            The input table.
        - use_MNX (bool, optional):
            Try mapping using the MetaNetX database.
            Defaults to True.
        - use_BiGG (bool, optional):
            Try mapping using the BiGG database.
            Defaults to True.
        - use_KEGG (bool, optional):
            Try mapping using the KEGG database.
            Defaults to True.
        - threshold_add_reacs (int, optional):
            Maximum number of reactions allowed per EC number per ncbiprotein ID
            to be added. Otherwise skip addition of reactions due to insufficient evidence
            Defaults to 5.

    Returns:
        pd.DataFrame:
            The extended table.
    """

    def _map_ec_to_reac_mnx(
        unmapped_reacs: pd.DataFrame, threshold_add_reacs: int = 5
    ) -> pd.DataFrame:
        """Helper function of :py:func:`~refinegems.classes.gapfill.map_ec_to_reac`
        for the mapping using the MetaNetX database.

        Args:
            - unmapped_reacs (pd.DataFrame):
                The input table (ec-code and ncbiprotein columns)
            - threshold_add_reacs (int, optional):
                Maximum number of reactions allowed per EC number per ncbiprotein ID
                to be added. Otherwise skip addition of reactions due to insufficient evidence
                Defaults to 5.

        Returns:
            pd.DataFrame:
                The extended table
        """

        # input: pd.DataFrame with at least the ec-code column
        # load MNX reac prop table
        mnx_reac_prop = load_a_table_from_database("mnx_reac_prop", False)
        # convert table into one EC-number per row
        mnx_reac_prop.drop("is_balanced", inplace=True, axis=1)
        mnx_reac_prop["ec-code"] = mnx_reac_prop["ec-code"].apply(
            lambda x: x.split(r";") if isinstance(x, str) else None
        )
        # exclude entries without EC-number
        mnx_reac_prop = mnx_reac_prop.explode("ec-code").dropna(subset="ec-code")
        # merge with unmapped reactions
        reacs_mapped = unmapped_reacs.merge(mnx_reac_prop, on="ec-code", how="left")
        # filter out mappings with more than x merged reactions, to ensure quality
        reacs_mapped = reacs_mapped.explode("ncbiprotein")
        reacs_mapped = reacs_mapped.groupby(["ncbiprotein", "ec-code"]).filter(
            lambda x: len(x) < threshold_add_reacs
        )
        reacs_mapped = reacs_mapped.merge(
            unmapped_reacs.explode("ncbiprotein"),
            on=["ec-code", "ncbiprotein"],
            how="outer",
        )
        # rename columns and cleanup
        reacs_mapped.rename({"mnx_equation": "equation"}, inplace=True, axis=1)
        reacs_mapped = (
            reacs_mapped.groupby(["ec-code", "id"], dropna=False)
            .agg(
                {
                    "ncbiprotein": lambda x: x.tolist(),
                    "equation": "first",
                    "reference": "first",
                    "is_transport": "first",
                }
            )
            .reset_index()
        )
        reacs_mapped = reacs_mapped.reindex(
            columns=[
                "ec-code",
                "ncbiprotein",
                "id",
                "equation",
                "reference",
                "is_transport",
            ]
        )
        reacs_mapped["via"] = reacs_mapped["id"].apply(
            lambda x: "MetaNetX" if x else None
        )

        return reacs_mapped

    def _map_ec_to_reac_bigg(unmapped_reacs: pd.DataFrame) -> pd.DataFrame:
        """Helper function of :py:func:`~refinegems.classes.gapfill.map_ec_to_reac`
        for the mapping using the BiGG database.

        Args:
            - unmapped_reacs (pd.DataFrame):
                The input table (ec-code and ncbiprotein columns)

        Returns:
            pd.DataFrame:
                The extended table
        """

        # load BiGG reaction namespace
        bigg_reacs = load_a_table_from_database("bigg_reactions", False)
        bigg_reacs.dropna(subset="EC Number", inplace=True)
        bigg_reacs = bigg_reacs[["id", "reaction_string", "EC Number"]].rename(
            {"reaction_string": "equation", "EC Number": "ec-code"},
            inplace=False,
            axis=1,
        )
        bigg_reacs["ec-code"] = bigg_reacs["ec-code"].apply(
            lambda x: x.split(r",") if isinstance(x, str) else None
        )
        bigg_reacs = bigg_reacs.explode("ec-code")
        # merge with unmapped reactions
        bigg_mapping = unmapped_reacs.merge(bigg_reacs, on=["ec-code"], how="left")
        bigg_mapping.mask(bigg_mapping.isna(), other=None, inplace=True)
        # make conform to format
        bigg_mapping["reference"] = None
        bigg_mapping["is_transport"] = None
        bigg_mapping["via"] = bigg_mapping["id"].apply(lambda x: "BiGG" if x else None)

        return bigg_mapping

    def _map_ec_to_reac_kegg(unmapped_reacs: pd.DataFrame) -> pd.DataFrame:
        """Helper function of :py:func:`~refinegems.classes.gapfill.map_ec_to_reac`
        for the mapping using the KEGG database.

        Args:
            - unmapped_reacs (pd.DataFrame):
                The input table (ec-code and ncbiprotein columns)

        Returns:
            pd.DataFrame:
                The extended table
        """

        # get KEGG EC number information
        eccodes = unmapped_reacs["ec-code"]
        kegg_mapped = pd.DataFrame.from_dict(
            list(eccodes.progress_apply(parse_KEGG_ec))
        )
        kegg_mapped = unmapped_reacs.merge(kegg_mapped, on="ec-code")

        kegg_mapped["is_transport"] = None
        kegg_mapped["via"] = kegg_mapped["id"].apply(lambda x: "KEGG" if x else None)
        kegg_mapped.explode(column="id")

        return kegg_mapped

    # input table should have format:
    #   ec-code | ncbiprotein
    #   one EC number per row, list of ncbiprotein per row allowed
    if (
        len(table.columns) != 2
        or "ec-code" not in table.columns
        or "ncbiprotein" not in table.columns
    ):
        raise ValueError("Wrong table format. Cannot map EC to reaction.")

    # map to MetaNetX
    if use_MNX:
        table = _map_ec_to_reac_mnx(table, threshold_add_reacs)
    # map to BiGG
    if use_BiGG:
        if "id" in table.columns:
            to_map = table[table["id"].isna()][["ec-code", "ncbiprotein"]]
            if len(to_map) > 0:
                to_map = _map_ec_to_reac_bigg(to_map)
                table = pd.concat([to_map, table[~table["id"].isna()]])
        else:
            table = _map_ec_to_reac_bigg(table)

    # map to KEGG
    if use_KEGG:
        if "id" in table.columns:
            to_map = table[table["id"].isna()][["ec-code", "ncbiprotein"]]
            if len(to_map) > 0:
                to_map = _map_ec_to_reac_kegg(to_map)
                table = pd.concat([to_map, table[~table["id"].isna()]])
        else:
            table = _map_ec_to_reac_kegg(table)
    # explode
    table = table.explode("id", ignore_index=True).explode("id", ignore_index=True)

    # output:   ec-code ncbiprotein	id	equation	reference	is_transport	via
    return table



############################################################################
# classes
############################################################################

# -------------
# abtract class
# -------------



[docs]
class GapFiller(ABC):
    """Abstract base class for the gap filling.

    Already includes functions for the "filling" part of the gap-filling approach
    and some helper functions. Each subclass needs an implementation of `find_missing_genes`
    and `find_missing_reactions` to determine the entities, that are missing in the model.

    Attributes:
        - full_gene_list (list):
            List of all the genes.
        - missing_genes:
            DataFrame containing all genes found as missing with additional information. <br>
            ``ncbiprotein | locus_tag | <optional columns>``
        - missing_reactions:
            DataFrame containing all reactions found as missing with additional information.<br>
            ``ec-code | ncbiprotein | id | equation | reference | <is_transport> | via | add_to_GPR``
        - geneid_type (str):
            What type of gene ID the model contains.
            Defaults to 'ncbi'.
        - _statistics (dict):
            Dictionary of statistical information of the gap-filling run. Includes e.g.
            the number of added genes and reactions.
        - manual_curation (dict):
            Dictionary of reaction and gene IDs to be used for manual curation.
    """


[docs]
    def __init__(self) -> None:

        # data
        self.__dict__["full_gene_list"] = None # self.full_gene_list = None
        self.missing_genes = (
            None  # missing genes, that have not yet been sorted into any category
        )
        self.missing_reactions = (
            None  # missing reacs, that have not yet been sorted into any category
        )

        # general information
        self.geneid_type = "ncbi"
        self._variety = "Undefined"  # Specifies the variety of the gapfiller, e.g. 'BioCyc', 'KEGG', 'GFF'

        # collect stats & Co, can be extended by subclasses
        self._statistics = {
            "genes": {
                "missing (total)": 0,
                "unmappable": 0,
                "missing (mappable)": 0,
                "duplicated NCBI Protein ID": 0,
                "duplicated locus tag": 0,
                "duplicated": 0,
                "added": 0,
                "building failed": 0,
                "missing (remaining)": 0,
            },
            "reactions": {
                "missing (based on genes)": 0,
                "already in model": 0,
                "missing (total)": 0,
                "mapped to MNX": 0,
                "mapped to BiGG": 0,
                "mapped to KEGG": 0,
                "unmappable": 0,
                "added": 0,
                "building failed": 0,
                "missing (remaining)": 0,
            },
        }
        self.manual_curation = {"genes": {}, "reactions": {}}
        
        # check, if metanetx table is in database, 
        for t in mnx_db_namespace.keys():
            q = f"SELECT name FROM sqlite_master WHERE type='table' AND name='mnx_{t}'"
            if load_a_table_from_database(q, True).empty:
                mes = f"MetaNetX table mnx_{t} not found in database. Filling part will most likely fail."
                warnings.warn(mes, UserWarning)

        # @IDEA maybe directly install the missing stuff?

    # abstract methods
    # ----------------


[docs]
    @abstractmethod
    def find_missing_genes(self, model):
        """Find missing genes in the model. Parameters can be extended as needed.

        Needs to save a table in the format

        ``ncbiprotein | locus_tag | <optional columns>``

        to the attribute missing_genes.

        """
        pass



[docs]
    @abstractmethod
    def find_missing_reactions(self, model):
        """Find missing reactions in the model. Parameters can be extended as needed.

        Needs to save a table of the format

        ``ec-code | ncbiprotein | id | equation | reference | <is_transport> | via | add_to_GPR``

        to the attribute missing_reactions.

        Method specific information can be added to the reference column, which is
        expected to contain a dictionary. The 'via' column describes the way
        the database will be added to the model.
        """
        pass


    # finding the gaps
    # ----------------

[docs]
    def _find_reac_in_model(
        self,
        model: cobra.Model,
        eccode: str,
        id: str,
        idtype: Literal["MetaNetX", "KEGG", "BiGG", "BioCyc"],
        include_ec_match: bool = False,
    ) -> Union[None, list]:
        """Helper function of :py:class:`~refinegems.classes.gapfill.GapFiller`.
        Search the model for an ID (and optionally EC number), to determine, if the
        reaction is in the model.

        Args:
            - model (cobra.Model):
                The model loaded with COBRApy.
            - eccode (str):
                The EC number in the format: X.X.X.X
            - id (str):
                The ID to search for.
            - idtype (Literal['MetaNetX','KEGG','BiGG', 'BioCyc']):
                Name of the database the ID belongs to.
            - include_ec_match (bool, optional):
                Option to include a match if only the EC number matches.
                Defaults to False.

        Returns:
            (1) Case one or more match found:
                    list:
                        List of the ID of the reactions in the model, that match
                        the query.

            (2) Case no match:
                    None:
                        Nothing found.
        """

        MAPPING = {
            "MetaNetX": "metanetx.reaction",
            "KEGG": "kegg.reaction",
            "BiGG": "bigg.reaction",
            "BioCyc": "metacyc.reaction",
        }

        found = []
        for r in model.reactions:
            if MAPPING[idtype] in r.annotation.keys():
                if (
                    isinstance(r.annotation[MAPPING[idtype]], list)
                    and id in r.annotation[MAPPING[idtype]]
                ):
                    found.append(r.id)
                elif (
                    isinstance(r.annotation[MAPPING[idtype]], str)
                    and id == r.annotation[MAPPING[idtype]]
                ):
                    found.append(r.id)
            if include_ec_match and eccode and "ec-code" in r.annotation.keys():
                if (
                    isinstance(r.annotation["ec-code"], list)
                    and eccode in r.annotation["ec-code"]
                ):
                    found.append(r.id)
                elif (
                    isinstance(r.annotation["ec-code"], str)
                    and eccode == r.annotation["ec-code"]
                ):
                    found.append(r.id)

        found = list(set(found))

        if len(found) > 0:
            return found

        return None


    # actual "Filling" part
    # ---------------------

[docs]
    def add_genes_from_table(self, model: libModel, gene_table: pd.DataFrame) -> None:
        """Create new GeneProduct for a table of genes in the format:

        | ncbiprotein | locus_tag | reference | ... |

        The dots symbolise additional columns, that can be passed to the function,
        but will not be used by it. The other columns are required.

        Args:
            - model (libModel):
                The model loaded with libSBML.
            - gene_table (pd.DataFrame):
                The table with the genes to add. At least needs the columns
                *ncbiprotein*, *locus_tag* and *reference*. More columns can be provided.
        """

        # ncbiprotein | locus_tag | reference | ... 
        # work on a copy to ensure input stays the same
        gene_table = gene_table.copy()
        # gene_table.drop(columns=['ec-code'],inplace=True)

        # create gp
        for idx, x in tqdm(
            gene_table.iterrows(), total=len(gene_table), desc="Adding genes to model"
        ):
            create_gp(
                model, x["ncbiprotein"], locus_tag=x["locus_tag"], reference=x['reference']
            )
            self._statistics["genes"]["added"] += 1



[docs]
    def add_gene_reac_associations_from_table(
        self, model: libModel, reac_table: pd.DataFrame
    ) -> None:
        """Using a table with at least the columns 'ncbiprotein'
        (containing e.g. NCBI protein identifier (lists), should be gene IDs in the model)
        and 'add_to_GPR' (containing reactions identifier (lists)), add the gene IDs to the
        GPRs of the corresponding reactions.

        Args:
            - model (libModel):
                The model loaded with libSBML.
            - reac_table (pd.DataFrame):
                The table containing at least the columns 'ncbiprotein' (gene IDs) and
                'add_to_GPR' (reaction IDs)
        """

        model_gene_ids = [_.getId() for _ in model.getPlugin(0).getListOfGeneProducts()]

        # get each unique ncbiprotein vs reaction(s) mapping
        reac_table = reac_table[["ncbiprotein", "add_to_GPR"]]
        reac_table = reac_table.explode("ncbiprotein")
        reac_table = (reac_table.explode('add_to_GPR').groupby('ncbiprotein').agg(list).reset_index())
        reac_table.drop_duplicates(subset="ncbiprotein", inplace=True)

        # add the genes to the corresponding GPRs
        for idx, row in reac_table.iterrows():
            # check, if G_+ncbiprotein in model
            # if yes, add gpr
            geneid = _f_gene_rev(row["ncbiprotein"])
            for reacid in row["add_to_GPR"]:
                current_reacid = "R_" + reacid
                current_mgids = [mgid for mgid in model_gene_ids if geneid in mgid]
                if current_mgids:
                    if len(current_mgids) == 1:
                        create_gpr(model.getReaction(current_reacid), current_mgids[0])
                    else:
                        mes = f"Found multiple matches for {geneid} in model: {current_mgids}. Belongs to reaction {current_reacid}."
                        logger.warning(mes)
                # else, print warning
                else:
                    mes = f"Cannot find {geneid} in model. Should be added to {current_reacid}."
                    logger.warning(mes)



[docs]
    def add_reactions_from_table(
        self,
        model: cobra.Model,
        missing_reac_table: pd.DataFrame,
        formula_check: Literal["none", "existence", "wildcard", "strict"] = "existence",
        exclude_dna: bool = True,
        exclude_rna: bool = True,
        idprefix: str = "refineGEMs",
        namespace: Literal["BiGG"] = "BiGG",
    ) -> pd.DataFrame:
        """Helper function to add reactions to a model from the missing_reactions table
        (output of the chosen implementation of :py:meth:`~refinegems.classes.gapfill.GapFiller.find_missing_reactions`)

        Args:
            - model (cobra.Model):
                The model, loaded with COBRpy.
            - missing_reac_table (pd.DataFrame):
                The missing reactions table.
            - formula_check (Literal['none','existence','wildcard','strict'], optional):
                Param for checking metabolite formula before adding them to the model.
                For more information, refer to :py:func:`~refinegems.utility.entities.isreaction_complete`
                Defaults to 'existence'.
            - exclude_dna (bool, optional):
                Option to exclude reaction containing 'DNA' from being added to the model.
                Defaults to True.
            - exclude_rna (bool, optional):
                Option to exclude reaction containing 'RNA' from being added to the model.
                Defaults to True.
            - idprefix (str, optional):
                A prefix to use, if pseudo-IDs need to be created.
                Defaults to 'refineGEMs'.
            - namespace (Literal['BiGG'], optional):
                Namespace to use for the reactions and metabolites
                (and the model). Defaults to 'BiGG'.

        Raises:
            - TypeError: Unknown return type for reac param. Please contact the developers.

        Returns:
            pd.DataFrame:
                Table containing the information about which genes can now be added
                to reactions (use for GPR curation).
        """

        # reconstruct reactions
        # ---------------------
        for idx, row in tqdm(
            missing_reac_table.iterrows(),
            desc="Trying to add missing reacs",
            total=missing_reac_table.shape[0],
        ):
            # add EC number to references
            if row["reference"]:
                refs = row["reference"]
                if isinstance(refs, dict):
                    pass
                elif refs[0] == "{":
                    refs = refs.replace(r"'", r"\"")
                    refs = json.loads(refs)
                else:
                    refs = refs.split(r":")
                    refs = {refs[0]: refs[1]}
            else:
                refs = {}

            if row["ec-code"]:
                if "ec-code" in refs.keys():
                    if not isinstance(refs["ec-code"], list):
                        refs["ec-code"] = [refs["ec-code"]]
                    if isinstance(row["ec-code"], list):
                        refs["ec-code"] = list(set(refs["ec-code"] + row["ec-code"]))
                    elif isinstance(row["ec-code"], str):
                        refs["ec-code"].append(row["ec-code"])
                        refs["ec-code"] = list(set(refs["ec-code"]))
                    else:
                        warnings.warn(
                            f'Unknown type for ec-code: {type(row["ec-code"])}',
                            UserWarning,
                        )
                else:
                    refs["ec-code"] = (
                        row["ec-code"]
                        if isinstance(row["ec-code"], list)
                        else [row["ec-code"]]
                    )

            # build reaction
            reac = None
            match row["via"]:
                # MetaNetX
                case "MetaNetX":
                    reac = build_reaction_mnx(
                        model,
                        row["id"],
                        reac_str=str(row["equation"]),
                        references=refs,
                        idprefix=idprefix,
                        namespace=namespace,
                    )
                # KEGG
                case "KEGG":
                    reac = build_reaction_kegg(
                        model,
                        row["id"],
                        reac_str=str(row["equation"]),
                        references=refs,
                        idprefix=idprefix,
                        namespace=namespace,
                    )
                # BiGG
                case "BiGG":
                    reac = build_reaction_bigg(
                        model,
                        row["id"],
                        references=refs,
                        idprefix=idprefix,
                        namespace=namespace,
                    )

                # Unknown database
                case _:
                    mes = f"""Unknown database name for reaction reconstruction: {row["via"]}\n
                    Reaction will not be reconstructed."""
                    warnings.warn(mes, UserWarning)

            # check output of reconstruction
            # ------------------------------
            # case 1: reconstruction was not possible
            if not reac:
                self._statistics["reactions"]["building failed"] += 1
                pass  # nothing to do here
            # case 2: reaction(s) found in model
            elif isinstance(reac, list):
                # add found names to the add_to_GPR column of the table
                current_gpr = missing_reac_table.loc[idx, "add_to_GPR"]
                if str(current_gpr) == "nan":
                    current_gpr = None
                if not current_gpr:
                    missing_reac_table.at[idx, "add_to_GPR"] = reac
                else:
                    missing_reac_table.at[idx, "add_to_GPR"] = list(
                        set(reac + current_gpr)
                    )
            # case 3: new reaction was generated
            elif isinstance(reac, cobra.Reaction):
                # validate reaction
                if isreaction_complete(
                    reac,
                    formula_check=formula_check,
                    exclude_dna=exclude_dna,
                    exclude_rna=exclude_rna,
                ):
                    # Extend reaction notes with information about the GapFiller
                    reac.notes["found with"] = f"refineGEMs GapFiller, {self._variety}"
                    # add reaction to model (if validation successful)
                    model.add_reactions([reac])
                    self._statistics["reactions"]["added"] += 1
                    # add reaction ID to table under add_to_GPR
                    current_gpr = missing_reac_table.loc[idx, "add_to_GPR"]
                    if pd.isna(current_gpr):
                        missing_reac_table.at[idx, "add_to_GPR"] = [reac.id]
                    else:
                        current_gpr.append(reac.id)
                        missing_reac_table.at[idx, "add_to_GPR"] = list(
                            set(current_gpr)
                        )

            # case 4: should never occur
            else:
                mes = f"Unknown return type for reac param. Please contact the developers."
                raise TypeError(mes)

        # save reactions, that could not be recontructed, for manual curation
        manual_curation_reacs = missing_reac_table[
            missing_reac_table["add_to_GPR"].isna()
        ]
        self.manual_curation["reactions"]["building failed"] = manual_curation_reacs
        self._statistics["reactions"]["building failed"] = manual_curation_reacs[
            "id"
        ].nunique()
        # return the updated table with successfully reconstructed reaction ids
        # to enable adding the genes
        missing_gprs = missing_reac_table[~missing_reac_table["add_to_GPR"].isna()]
        return missing_gprs



[docs]
    def fill_model(self, model: Union[cobra.Model, libModel], **kwargs) -> libModel:
        """Based on a table of missing genes and missing reactions,
        fill the gaps in a model as good as possible automatically.

        .. note::

            This model rewrites and reloads the input model. Only the returned model
            has all the edits.

        Args:
            - model (Union[cobra.Model,libModel]):
                The model, either a libSBML or COBRApy model entity.
            - kwargs:
                Additional parameters to be passed to
                :py:meth:`~refinegems.classes.gapfill.GapFiller.add_reactions_from_table`.

        Raises:
            - TypeError: Unknown type of model.

        Returns:
            libModel:
                The gap-filled model.
        """
        # Step 0: Preparations
        # --------------------
        # load the correct type of model for the first step
        match model:
            case cobra.Model():
                model = convert_cobra_to_libsbml(model, 'notes')
            case libModel():
                pass
            case _:
                mes = f"Unknown type of model: {type(model)}"
                raise TypeError(mes)

        # Check if missing reactions found, if not return model
        if not isinstance(self.missing_reactions, pd.DataFrame) or self.missing_reactions.empty:
            return model

        # Filter out reactions without ncbiprotein
        self.manual_curation["reactions"]["no GPR"] = self.missing_reactions[
            self.missing_reactions["ncbiprotein"].isna()
        ]
        self._statistics["reactions"]["unmappable"] += self.manual_curation[
            "reactions"
        ]["no GPR"]["id"].nunique()
        self.missing_reactions = self.missing_reactions[
            ~self.missing_reactions["ncbiprotein"].isna()
        ]

        # Filter out genes without ncbiprotein
        self.manual_curation["genes"]["no ncbiprotein"] = self.missing_genes[
            self.missing_genes["ncbiprotein"].isna()
        ]
        num_genes_no_ncbi_id = self.manual_curation["genes"]["no ncbiprotein"][
            "locus_tag"
        ].nunique()
        self._statistics["genes"]["unmappable"] += num_genes_no_ncbi_id
        self._statistics["genes"]["missing (mappable)"] -= num_genes_no_ncbi_id
        self.missing_genes = self.missing_genes[
            ~self.missing_genes["ncbiprotein"].isna()
        ]

        # Filter out genes with duplicated locus tag & NCBI Protein ID
        # to avoid duplicated IDs & labels
        # Remove duplicates
        deduplicated_missing_genes = self.missing_genes.drop_duplicates(subset=["ncbiprotein", "locus_tag"])
        # Save duplicates for manual curation
        self.manual_curation["genes"]["duplicated"] = self.missing_genes[self.missing_genes.duplicated(subset=["ncbiprotein", "locus_tag"], keep='first')]
        # Get statistics
        self._statistics["genes"]["duplicated"] = self.manual_curation["genes"]["duplicated"]["locus_tag"].nunique()
        # Update missing genes
        self.missing_genes = deduplicated_missing_genes

        # filter out duplicated genes based on NCBI Protein IDs to avoid duplicated IDs in the model
        if len(self.missing_genes) != len(self.missing_genes["ncbiprotein"].unique()):
            self.manual_curation["genes"]["duplicated NCBI Protein ID (not added)"] = (
                self.missing_genes[
                    self.missing_genes.duplicated(subset=["ncbiprotein"])
                ]
            )
            self._statistics["genes"]["duplicated NCBI Protein ID"] = self.manual_curation["genes"][
                "duplicated NCBI Protein ID (not added)"
            ]["locus_tag"].nunique()
            self.missing_genes = self.missing_genes[
                ~self.missing_genes.duplicated(subset=["ncbiprotein"])
            ]

        # filter out duplicated genes based on the locus tag to avoid duplicated labels in the model
        if len(self.missing_genes) != len(self.missing_genes["locus_tag"].unique()):
            self.manual_curation["genes"]["duplicated locus tag (not added)"] = (
                self.missing_genes[
                    self.missing_genes.duplicated(subset=["locus_tag"])
                ]
            )
            self._statistics["genes"]["duplicated locus tag"] = self.manual_curation["genes"][
                "duplicated locus tag (not added)"
            ]["locus_tag"].nunique() # @ASK Use something different for the stats here?
            self.missing_genes = self.missing_genes[
                ~self.missing_genes.duplicated(subset=["locus_tag"])
            ]

        # Store mapping table for later manual curation (or debugging purposes)
        self.manual_curation["genes"]["missing, before filling"] = self.missing_genes
        self.manual_curation["reactions"]["missing, before filling"] = self.missing_reactions

        # Step 1: Add genes to model whose reactions are already in it
        # -------------------------------------------------------------
        # filter the respective genes and reactions
        reacs_in_model = self.missing_reactions[
            ~(self.missing_reactions["add_to_GPR"].isna())
        ]
        ncbiprot_with_reacs_in_model = [*chain(*list(reacs_in_model["ncbiprotein"]))]
        genes_with_reacs_in_model = self.missing_genes[
            self.missing_genes["ncbiprotein"].isin(ncbiprot_with_reacs_in_model)
        ]

        if len(genes_with_reacs_in_model) > 0:
            # add genes as gene products to model
            self.add_genes_from_table(model, genes_with_reacs_in_model)
            # extend gene production rules
            self.add_gene_reac_associations_from_table(model, reacs_in_model)

            # what remains:
            self.missing_reactions = self.missing_reactions[
                self.missing_reactions["add_to_GPR"].isna()
            ]
            self.missing_genes = self.missing_genes[
                ~(self.missing_genes["ncbiprotein"].isin(ncbiprot_with_reacs_in_model))
            ]  #

        # Step 2: Add reactions to model, if reconstruction successful
        # ------------------------------------------------------------

        if len(self.missing_reactions) > 0:
            # re-load model with cobrapy
            with NamedTemporaryFile(suffix=".xml", delete=False) as tmp:
                write_model_to_file(model, tmp.name)
                model = load_model(tmp.name, "cobra")
            os.remove(tmp.name)

            # .......................
            # @DEBUG
            # if len(self.missing_reactions) > 10:
            #     self.missing_reactions = self.missing_reactions.sample(20)
            #     print('fill_model: Running in debugging mode')
            # .......................

            # add reactions to model
            missing_gprs = self.add_reactions_from_table(
                model, self.missing_reactions, **kwargs
            )

        # Step 3: Add GPRs + genes for the newly curated reactions
        # --------------------------------------------------------

        # re-load model with libsbml
        model = convert_cobra_to_libsbml(model, "notes")

        try:
            if len(missing_gprs) > 0:
                # filter for genes for GPRs but not yet in model
                ncbiprot_with_reacs_in_model = [
                    *chain(*list(missing_gprs["ncbiprotein"]))
                ]
                genes_with_reacs_in_model = self.missing_genes[
                    self.missing_genes["ncbiprotein"].isin(ncbiprot_with_reacs_in_model)
                ]
                if len(genes_with_reacs_in_model) > 0:
                    # add genes as gene products to model
                    self.add_genes_from_table(model, genes_with_reacs_in_model)
                    # extend gene production rules
                    reacs_in_model = self.missing_reactions[
                        ~(self.missing_reactions["add_to_GPR"].isna())
                    ]
                    self.add_gene_reac_associations_from_table(model, reacs_in_model)

                    self.missing_genes = self.missing_genes[
                        ~(
                            self.missing_genes["ncbiprotein"].isin(
                                ncbiprot_with_reacs_in_model
                            )
                        )
                    ]
        except NameError:
            warnings.warn("Something went wrong. Contact the developers (gapfillmodel)")

        # collect stats and stuff for manual curation
        self.manual_curation["genes"]["building failed"] = self.missing_genes
        self._statistics["genes"]["building failed"] = self.missing_genes[
            "locus_tag"
        ].nunique()

        # calculate remaining statistics
        self._statistics["reactions"]["missing (remaining)"] += self._statistics[
            "reactions"
        ]["building failed"]
        self._statistics["genes"]["missing (remaining)"] += self._statistics["genes"][
            "building failed"
        ]

        return model



[docs]
    def report(
        self, dir: str, hide_zeros: bool = False, no_title: bool = False
    ) -> None:
        """Based on the previous gap-filling, save statistics and missing genes/reactions for manual curation.

        Args:
            - dir (str):
                Path to a directory to save the report to.
            - hide_zeros (bool, optional):
                Option to hide statistics with zero counts. Defaults to False.
            - with_title (bool, optional):
                Option to get figure without title. Defaults to False.
        """
        logger.warning(
            "Please keep in mind that all statistical values are determined while running the tool "
            + "and only unique values are tracked."
        )

        statistics_report = GapFillerReport(
            self._variety,
            deepcopy(self._statistics),
            self.manual_curation,
            hide_zeros,
            no_title,
        )
        statistics_report.save(Path(dir))




# --------------------
# Gapfilling with KEGG
# --------------------

[docs]
class KEGGapFiller(GapFiller):
    """Based on a KEGG organism ID (corresponding to the organism of the model),
    find missing genes in the model and map them to reactions to try and fill the gaps
    found with the KEGG database.

    .. note::

        Please keep in mind that using this module requires a model containing the Genbank locus tags as labels as these 
        are used in combination with the organism ID to query KEGG. Usually, the KEGG Gene ID consists of the organism 
        ID and the Genbank locus tag and looks like `<organismid>:<locus_tag>`.
        If your model does not conform to this you can either use the function
        :py:func:`~refinegems.curation.curate.polish_model` or the function
        :py:func:`~refinegems.curation.curate.extend_gp_annots_via_mapping_table` in combination with the function
        :py:func:`~refinegems.curation.curate.extend_gp_annots_via_KEGG`.
        
    .. warning:: If the locus tag from Genbank and the locus tag part from the KEGG Gene ID do not match and running the 
        functions above does not solve the issue for your organism, please recheck if all GeneProducts in your model 
        contain valid KEGG Gene IDs in the annotation bag. Otherwise, add these manually to the model.

    .. warning:: 
    
        If the Genbank locus tags are not part of the KEGG Gene ID, please recheck the locus tags added as labels to the 
        newly created GeneProducts after running :py:meth:`~refinegems.classes.gapfill.fill_model`.

    .. note::

        Due to the KEGG REST API this is relatively slow.
        
    .. attention:: 

        A second version of this class is in development, that will enable gap-filling via KEGG
        using an orthologouos strains instead of relying on a direct match in the KEGG database. 

    Attributes:

    - GapFiller Attributes:
        All attributes of the parent class :py:class:`~refinegems.classes.gapfill.GapFiller`
    - organismid (str, required):
        Abbreviation of the organism in the KEGG database.

    """


[docs]
    def __init__(self, organismid: str) -> None:
        super().__init__()
        self._variety = "KEGG"
        self.organismid = organismid



[docs]
    def find_missing_genes(self, model: libModel):
        """Get the missing genes in model in comparison to the KEGG entry of the
        organism. Saves a table containing the missing genes
        to the attribute missing_genes.

        Format:

        ``locus_tag | kegg.orthology | ec-code | ncbiprotein | reference``

        Args:
            - model (libModel):
                The model loaded with libSBML.

        """

        # Function originally from refineGEMs.genecomp/refineGEMs.KEGG_analysis/entities --- Modified
        def get_model_genes(model: libModel) -> pd.DataFrame:
            """Extracts KEGG Genes from given model

            Args:
                - model (model-libsbml):
                    Model loaded with libSBML

            Returns:
                pd.DataFrame:
                    Table with all KEGG Genes in the model
            """
            genes_in_model = []
            for gene in model.getPlugin(0).getListOfGeneProducts():
                cv_terms = gene.getCVTerms()
                if cv_terms:
                    for cv_term in cv_terms:
                        for idx in range(cv_term.getNumResources()):
                            uri = cv_term.getResourceURI(idx)
                            if "kegg.genes" in uri:
                                genes_in_model.append(
                                    re.split(r"kegg.genes:|kegg.genes/", uri)[1]
                                ) # work with old/new pattern

            return pd.DataFrame(genes_in_model, columns=["kegg.genes"])

        # Step 1: get genes from model
        # ----------------------------
        genes_in_model = get_model_genes(model)

        # Step 2: get genes of organism from KEGG
        # ---------------------------------------
        gene_KEGG_list = KEGG().list(self.organismid)
        gene_KEGG_table = pd.read_table(io.StringIO(gene_KEGG_list), header=None)
        gene_KEGG_table.columns = ["kegg.genes", "CDS", "position", "protein"]
        self.full_gene_list = gene_KEGG_table
        gene_KEGG_table = gene_KEGG_table[["kegg.genes"]]

        # Statistics on full gene list based on KEGG
        self._statistics['genes'] = insert_into_dict(
            self._statistics['genes'], (
                f"total (based on {self._variety})",
                (
                    self.full_gene_list["kegg.genes"].nunique()
                    + int(self.full_gene_list["kegg.genes"].isna().sum())
                    )
            ),
            'missing (total)'
        )

        # Step 3: KEGG vs. model genes -> get missing genes for model
        # ----------------------------
        genes_not_in_model = gene_KEGG_table[
            ~gene_KEGG_table["kegg.genes"].isin(genes_in_model["kegg.genes"])
        ]

        # Step 4: extract locus tag
        # -------------------------
        genes_not_in_model["locus_tag"] = (
            genes_not_in_model["kegg.genes"].str.split(r":").str[1]
        )

        # Step 5: map to EC via KEGG
        # --------------------------
        # @DEBUG .......................
        # genes_not_in_model = genes_not_in_model.iloc[0:50,:]
        # print(UserWarning('Running in debugging mode.'))
        # ..............................
        geneKEGG_mapping = pd.DataFrame.from_dict(
            list(genes_not_in_model["kegg.genes"].progress_apply(parse_KEGG_gene))
        )
        genes_not_in_model = genes_not_in_model.merge(
            geneKEGG_mapping, how="left", on="kegg.genes"
        )
        genes_not_in_model = genes_not_in_model.explode("ncbiprotein")

        # Step 6: Clean-up table to contain a single reference column with the KEGG Gene & UniProt IDs
        # ----------------------
        genes_not_in_model.mask(genes_not_in_model.isna(), other=None, inplace=True)
        genes_not_in_model = _clean_table_after_mapping(genes_not_in_model, 'gene')

        # Step 7: collect stats
        # ---------------------
        self._statistics["genes"]["missing (total)"] = genes_not_in_model[
            "locus_tag"
        ].nunique()

        self.missing_genes = genes_not_in_model



[docs]
    def find_missing_reactions(self, model: cobra.Model, threshold_add_reacs: int = 5):

        # Step 1: filter missing gene list + extract ECs
        # ----------------------------------------------
        # Filter out unmappable genes that have whether NCBI protein IDs nor EC numbers
        mask = (
            self.missing_genes["ec-code"].isna()
            & self.missing_genes["ncbiprotein"].isna()
        )
        self.manual_curation["genes"]["no ncbiprotein, no EC"] = self.missing_genes[
            mask
        ]
        self.missing_genes = self.missing_genes[~mask]
        self._statistics["genes"]["unmappable"] = self.manual_curation["genes"][
            "no ncbiprotein, no EC"
        ]["locus_tag"].nunique()

        # Filter out remaining unmappable genes due to missing EC numbers
        mask = self.missing_genes["ec-code"].isna()
        self.manual_curation["genes"]["no EC"] = self.missing_genes[mask]
        self._statistics["genes"]["unmappable"] += self.manual_curation["genes"][
            "no EC"
        ]["locus_tag"].nunique()
        self.missing_genes = self.missing_genes[~mask]

        # collect overall remaining statistics on genes
        self._statistics["genes"]["missing (mappable)"] = self.missing_genes[
            "locus_tag"
        ].nunique()
        self._statistics["genes"]["missing (remaining)"] = self._statistics["genes"][
            "unmappable"
        ]

        # get relevant infos for reacs
        self.missing_reactions = self.missing_genes[["ec-code", "ncbiprotein"]]

        # check, if any automatic gapfilling is possible
        if self.missing_reactions.empty:
            logger.warning(
                f"No missing reactions for the provided model {model.id} were found via {self._variety}."
            )
            return None
        # transform table into EC-number vs. list of NCBI protein IDs
        eccode = (
            self.missing_reactions["ec-code"]
            .apply(pd.Series)
            .reset_index()
            .melt(id_vars="index")
            .dropna()[["index", "value"]]
            .set_index("index")
        )
        ncbiprot = (
            self.missing_reactions["ncbiprotein"]
            .apply(pd.Series)
            .reset_index()
            .melt(id_vars="index")
            .dropna()[["index", "value"]]
            .set_index("index")
        )
        self.missing_reactions = pd.merge(
            eccode, ncbiprot, left_index=True, right_index=True
        ).rename(columns={"value_x": "ec-code", "value_y": "ncbiprotein"})
        self.missing_reactions = (
            self.missing_reactions.groupby(self.missing_reactions["ec-code"])
            .aggregate({"ncbiprotein": "unique"})
            .reset_index()
        )

        # @DEBUG .......................
        # self.missing_reactions = self.missing_reactions.iloc[10:30,:]
        # print(UserWarning('Running in debugging mode.'))
        # ..............................

        # Step 2: map EC to reaction(s) if possible
        # -----------------------------------------
        # via MNX, BiGG, KEGG
        reacs_mapped = map_ec_to_reac(self.missing_reactions, threshold_add_reacs)

        # Statistics on missing based on genes
        self._statistics["reactions"]["missing (based on genes)"] = reacs_mapped[
            "id"
        ].nunique() + int(reacs_mapped["id"].isna().sum())

        # Step 3: clean and map to model reactions
        # ----------------------------------------
        # need manual curation
        self.manual_curation["reactions"]["no ID"] = reacs_mapped[
            reacs_mapped["id"].isna()
        ]
        self._statistics["reactions"]["unmappable"] = int(
            self.manual_curation["reactions"]["no ID"]["id"].isna().sum()
        )

        # map to model reactions
        reacs_mapped = reacs_mapped[~reacs_mapped["id"].isna()]
        gpr = reacs_mapped.apply(
            lambda x: self._find_reac_in_model(model, x["ec-code"], x["id"], x["via"]),
            axis=1,
        )
        if type(gpr) == object:
            reacs_mapped["add_to_GPR"] = gpr
        else:
            reacs_mapped["add_to_GPR"] = None

        # statistics on reactions already in model
        self._statistics["reactions"]["already in model"] = reacs_mapped[
            ~reacs_mapped["add_to_GPR"].isna()
        ]["id"].nunique()

        # statistics on mapped reactions
        self._statistics["reactions"]["mapped to MNX"] = reacs_mapped[
            (reacs_mapped["add_to_GPR"].isna()) & (reacs_mapped["via"] == "MetaNetX")
        ]["id"].nunique()
        self._statistics["reactions"]["mapped to BiGG"] = reacs_mapped[
            (reacs_mapped["add_to_GPR"].isna()) & (reacs_mapped["via"] == "BiGG")
        ]["id"].nunique()
        self._statistics["reactions"]["mapped to KEGG"] = reacs_mapped[
            (reacs_mapped["add_to_GPR"].isna()) & (reacs_mapped["via"] == "KEGG")
        ]["id"].nunique()

        # calculate remaining statistics
        self._statistics["reactions"]["missing (total)"] = (
            self._statistics["reactions"]["missing (based on genes)"]
            - self._statistics["reactions"]["already in model"]
        )
        self._statistics["reactions"]["missing (remaining)"] = self._statistics[
            "reactions"
        ]["unmappable"]

        # return missing_reactions
        self.missing_reactions = reacs_mapped




# ----------------------
# Gapfilling with BioCyc
# ----------------------
# @BUG stats for unmappable genes weird / wrong

[docs]
class BioCycGapFiller(GapFiller):
    """
    | Based on a SmartTable with information on the genes and a SmartTable with
    | information on the reactions of the organism of the model, this class
    | finds missing genes in the model and maps them to reactions to try and
    | fill the gaps found with the BioCyc gene SmartTable.
    |
    | For specifications on the SmartTables see the attributes `biocyc_gene_tbl`
    | & `biocyc_reacs_tbl`

    .. note::

        Please keep in mind that using this module requires a model containing the Genbank locus tags as labels.
        If your model does not conform to this you can use one of the functions
        :py:func:`~refinegems.curation.curate.polish_model` or
        :py:func:`~refinegems.curation.curate.extend_gp_annots_via_mapping_table`.

    Attributes:
        - GapFiller Attributes:
            All attributes of the parent class :py:class:`~refinegems.classes.gapfill.GapFiller`
        - biocyc_gene_tbl_path (str, required):
            Path to organism-specific SmartTable for genes from BioCyc;
            Should contain the columns: ``Accession-2 | Reactions of gene``
        - biocyc_reacs_tbl_path (str, required):
            Path to organism-specific SmartTable for reactions from BioCyc;
            Should contain the columns:
            ``Reaction | Object ID | EC-Number | Spontaneous?``
        - gff (str, required):
            Path to organism-specific GFF file
    """

[docs]
    def __init__(
        self, biocyc_gene_tbl_path: str, biocyc_reacs_tbl_path: str, gff: str
    ) -> None:
        super().__init__()
        self._variety = "BioCyc"
        self.full_gene_list = biocyc_gene_tbl_path
        self.biocyc_rxn_tbl = biocyc_reacs_tbl_path
        self._gff = gff


    @property
    def full_gene_list(self):
        """
        | Get or set the current BioCyc Gene table.
        | While setting the provided path for a TSV file from BioCyc with the
        | columns ``'Accession-2' | 'Reactions of gene'`` is parsed and the
        | content of the file is stored in a DataFrame containing all rows where
        | a 'Reactions of gene' exists.

        .. hint::

            Please keep in mind that the column Accession-2 needs to contain Genbank locus tags. If that is not the case
            for your organism use the correct column from BioCyc and rename it accordingly.

        """
        return self._full_gene_list

    @full_gene_list.setter
    def full_gene_list(self, biocyc_gene_tbl_path: str):
        # Read table
        biocyc_genes = pd.read_table(
            biocyc_gene_tbl_path,
            usecols=["Accession-2", "Reactions of gene"],
            dtype=str,
        )

        # Rename columns for further use
        biocyc_genes.rename(
            columns={"Accession-2": "locus_tag", "Reactions of gene": "id"},
            inplace=True,
        )

        # Turn empty strings into None
        biocyc_genes.replace(r"", None, inplace=True)

        # Drop only complete empty rows
        biocyc_genes.dropna(how="all", inplace=True)

        # Add references column
        biocyc_genes["reference"] = None
        # @TODO Recheck BioCyc if references can be obtained
        # biocyc_genes = _clean_table_after_mapping(biocyc_genes, 'gene')

        # Statistics on full gene list based on BioCyc
        self._statistics['genes'] = insert_into_dict(
            self._statistics['genes'], (
                f"total (based on {self._variety})",
                biocyc_genes["locus_tag"].nunique()
            ),
            'missing (total)'
        )

        self._full_gene_list = biocyc_genes

    @property
    def biocyc_rxn_tbl(self):
        """
        | Get or set the current BioCyc Reaction table.
        | While setting the provided path for a TSV file from BioCyc with the
        | columns ``'Reaction' | 'Object ID' | 'EC-Number' | 'Spontaneous?'`` is
        | parsed and the content of the file is stored in a DataFrame.
        """
        return self._biocyc_rxn_tbl

    @biocyc_rxn_tbl.setter
    def biocyc_rxn_tbl(self, biocyc_reacs_tbl_path: str) -> pd.DataFrame:
        # Read table
        biocyc_rxns = pd.read_table(
            biocyc_reacs_tbl_path,
            usecols=["Reaction", "Object ID", "EC-Number", "Spontaneous?"],
            dtype=str,
        )

        # Rename columns for further use
        biocyc_rxns.rename(
            columns={
                "Reaction": "equation",
                "Object ID": "id",
                "EC-Number": "ec-code",
                "Spontaneous?": "is_spontaneous",
            },
            inplace=True,
        )

        # Turn empty strings into NaNs
        biocyc_rxns.replace(r"", None, inplace=True)

        # Set entries in is_spontaneous to booleans &
        # specify empty entries in 'is_spontaneous' as False
        biocyc_rxns["is_spontaneous"].replace({r"T": True, r"F": False}, inplace=True)
        biocyc_rxns["is_spontaneous"] = biocyc_rxns["is_spontaneous"].fillna(False)

        self._biocyc_rxn_tbl = biocyc_rxns


[docs]
    def find_missing_genes(self, model: libModel):
        """Retrieves the missing genes and reactions from the BioCyc table
        according to the 'Accession-2' identifiers (locus_tags)

        Args:
            - model (libModel):
                Model loaded with libSBML
        """

        # Step 1: get genes from model
        # ----------------------------
        geneps_in_model = [
            _.getLabel() for _ in model.getPlugin(0).getListOfGeneProducts()
        ]

        # Step 2a: Get genes of organism from BioCyc
        # ------------------------------------------
        # See self.full_gene_list

        # Step 2b: Filter out reactions without gene info
        # -----------------------------------------------
        # Save reactions without gene info
        reacs_no_gene_info = self.full_gene_list[
            self.full_gene_list["locus_tag"].isna()
        ]
        # Split reaction ids to get one per row
        reacs_no_gene_info['id'] = reacs_no_gene_info["id"].str.split(r"//")
        # Explode dataframe
        reacs_no_gene_info = reacs_no_gene_info.explode('id', ignore_index=True)
        self.manual_curation['reactions']['without gene info'] = reacs_no_gene_info

        # Add amount of reactions without gene info to statistics
        self._statistics['reactions'] = insert_into_dict(
            self._statistics['reactions'], (
                "unmappable (w/o gene info)", 
                self.manual_curation["reactions"]["without gene info"]["id"].nunique()
                ),
            'missing (based on genes)'
            )

        # Remove all rows where 'locus_tag' is None
        self.full_gene_list.dropna(subset="locus_tag", inplace=True)

        # Step 3: BioCyc vs. model genes -> get missing genes for model
        # -------------------------------------------------------------
        self.missing_genes = self.full_gene_list[
            ~self.full_gene_list["locus_tag"].isin(geneps_in_model)
        ]

        # Step 4: Get amount of missing genes in total
        # --------------------------------------------
        self._statistics["genes"]["missing (total)"] = self.missing_genes[
            "locus_tag"
        ].nunique()

        # Step 5: Filter results
        # ----------------------
        # Save not mappable genes due to no reaction ID
        self.manual_curation["genes"]["no reaction ID"] = self.missing_genes[
            self.missing_genes["id"].isna()
        ]

        # Add amount of unmappable genes to statistics
        self._statistics["genes"]["unmappable"] = self.manual_curation["genes"][
            "no reaction ID"
        ]["locus_tag"].nunique()

        # Remove all rows where 'id' is None
        self.missing_genes.dropna(subset="id", inplace=True)

        # Step 6: Get ncbiprotein IDs
        # ---------------------------
        # Parse GFF file to obtain locus_tag2ncbiportein mapping for all CDS
        locus_tag2ncbiprotein_df = parse_gff_for_cds(
            self._gff,
            {"locus_tag": "locus_tag", "protein_id": "ncbiprotein", "product": "name"},
        )
        locus_tag2ncbiprotein_df = locus_tag2ncbiprotein_df.explode("ncbiprotein")
        locus_tag2ncbiprotein_df = locus_tag2ncbiprotein_df.explode("name")

        # Get the complete missing genes dataframe with the ncbiprotein IDs
        self.missing_genes = self.missing_genes.merge(
            locus_tag2ncbiprotein_df, on="locus_tag"
        )

        # Save not mappable genes due to no ncbiprotein ID
        self.manual_curation["genes"]["no NCBI Protein ID"] = self.missing_genes[
            self.missing_genes["ncbiprotein"].isna()
        ]

        # Adjust amount of unmappable genes in statistics
        self._statistics["genes"]["unmappable"] += self.manual_curation["genes"][
            "no NCBI Protein ID"
        ]["locus_tag"].nunique()

        # Remove all rows where 'ncbiprotein' is None
        self.missing_genes.dropna(subset="ncbiprotein", inplace=True)
        
        # Step 7: Get amount of missing genes in total
        # --------------------------------------------
        self._statistics["genes"]["missing (mappable)"] = self.missing_genes[
            "locus_tag"
        ].nunique()
        self._statistics["genes"]["missing (remaining)"] = self._statistics["genes"][
            "unmappable"
        ]



[docs]
    def find_missing_reactions(self, model: cobra.Model):
        """Retrieves the missing reactions with more information like the
        equation, EC code, etc. according to the missing genes

        Args:
            - model (cobra.Model):
                Model loaded with COBRApy
        """

        # Step 1: filter missing gene list + extract ECs
        # ----------------------------------------------
        # Drop locus tag column as not needed for here
        missing_genes = self.missing_genes.drop(["locus_tag", "name"], axis=1)

        # Expand missing genes result table to merge with Biocyc reactions table
        missing_genes = pd.DataFrame(
            missing_genes["id"].str.split(r"//").tolist(),
            index=missing_genes["ncbiprotein"],
        ).stack()
        missing_genes = missing_genes.reset_index([0, "ncbiprotein"])
        missing_genes.columns = ["ncbiprotein", "id"]
        missing_genes["id"] = missing_genes["id"].str.strip()

        # Turn ncbiprotein column into lists of ncbiprotein IDs per reaction
        ncbiprotein_as_list = (
            missing_genes.groupby("id")["ncbiprotein"]
            .apply(list)
            .reset_index(name="ncbiprotein")
        )
        missing_genes.drop("ncbiprotein", axis=1, inplace=True)
        missing_genes = ncbiprotein_as_list.merge(missing_genes, on="id")
        missing_genes["ncbiprotein"] = missing_genes["ncbiprotein"].apply(
            lambda x: x if not None in x else list(filter(None, x))
        )

        # Drop duplicates to get the unique BioCyc IDs
        missing_genes.drop_duplicates(subset="id", inplace=True)

        # Get missing reactions from missing genes
        self.missing_reactions = missing_genes.merge(self.biocyc_rxn_tbl, on="id")

        # Turn ec-code entries with '//' into lists, remove prefix 'EC-' & get unique ec-codes
        self.missing_reactions["ec-code"] = (
            self.missing_reactions["ec-code"]
            .str.replace(r"EC-", r"")
            .str.split(r"\s*//\s*")
        )
        self.missing_reactions["ec-code"] = (
            self.missing_reactions["ec-code"]
            .fillna({i: [] for i in self.missing_reactions.index})
            .map(set)
            .map(list)
        )

        # Add 'G_spontaneous' as gene product if marked as spontaneous &
        # drop is_spontaneous column
        self.missing_reactions["ncbiprotein"] = self.missing_reactions.apply(
            lambda x: (
                x["ncbiprotein"]
                if not x["is_spontaneous"]
                else x["ncbiprotein"].append("spontaneous")
            ),
            axis=1,
        )
        self.missing_reactions.drop("is_spontaneous", axis=1, inplace=True)

        # Step 2: Get amount of missing reactions from BioCyc for statistics
        # ------------------------------------------------------------------
        self._statistics["reactions"]["missing (based on genes)"] = (
            self.missing_reactions["id"].nunique()
        )

        # Step 3: Map BioCyc to model reactions & cleanup
        # -----------------------------------------------
        # Add column 'via'
        self.missing_reactions["via"] = "BioCyc"

        # Filter reacs for already in model
        self.missing_reactions["add_to_GPR"] = self.missing_reactions.apply(
            lambda x: self._find_reac_in_model(model, x["ec-code"], x["id"], x["via"]),
            axis=1,
        )

        # Add column 'reference'
        self.missing_reactions["reference"] = None

        # Drop rows if 'id' is NaN
        self.manual_curation["reactions"]["no ID"] = self.missing_reactions[
            self.missing_reactions["id"].isna()
        ]

        self._statistics["reactions"]["unmappable"] = int(
            self.manual_curation["reactions"]["no ID"]["id"].isna().sum()
        )
        self.missing_reactions = self.missing_reactions[
            ~self.missing_reactions["id"].isna()
        ]

        # check, if any automatic gapfilling is possible
        if self.missing_reactions.empty:
            logger.warning(
                f"No missing reactions for the provided model {model.id} were found via {self._variety}."
            )
            return None

        # Step 4: Map missing reactions without entries in column 'add_to_GPR'
        #         to other databases to get a parsable reaction equation
        # --------------------------------------------------------------------
        # Map to MetaNetX/BiGG
        mapped_reacs = map_biocyc_to_reac(self.missing_reactions)

        # Filter reacs for already in model
        mapped_reacs["add_to_GPR"] = mapped_reacs.apply(
            lambda x: self._find_reac_in_model(model, x["ec-code"], x["id"], x["via"]),
            axis=1,
        )

        # Step 5: Get results
        # -------------------
        # statistics on reactions already in model
        self._statistics["reactions"]["already in model"] = mapped_reacs[
            ~mapped_reacs["add_to_GPR"].isna()
        ]["id"].nunique()

        # statistics on mapped reactions
        self._statistics["reactions"]["mapped to MNX"] = mapped_reacs[
            mapped_reacs["via"] == "MetaNetX"
        ]["id"].nunique()
        self._statistics["reactions"]["mapped to BiGG"] = mapped_reacs[
            mapped_reacs["via"] == "BiGG"
        ]["id"].nunique()
        self._statistics["reactions"]["mapped to KEGG"] = mapped_reacs[
            mapped_reacs["via"] == "KEGG"
        ]["id"].nunique()

        # Split missing reactios based on entries in 'via' & 'add_to_GPR'
        mask = (mapped_reacs["via"] == "BioCyc") & (mapped_reacs["add_to_GPR"].isna())

        # DataFrame with unmappable BioCyc IDs & No entries in 'add_to_GPR'
        self.manual_curation["reactions"]["no mapping"] = mapped_reacs[mask]
        self._statistics["reactions"]["unmappable"] += self.manual_curation[
            "reactions"
        ]["no mapping"]["id"].nunique()

        # calculate remaining statistics
        self._statistics["reactions"]["missing (total)"] = (
            self._statistics["reactions"]["missing (based on genes)"]
            - self._statistics["reactions"]["already in model"]
            + self._statistics['reactions']['unmappable (w/o gene info)']
        )
        self._statistics["reactions"]["missing (remaining)"] = self._statistics[
            "reactions"
        ]["unmappable"] + self._statistics['reactions']['unmappable (w/o gene info)']

        # Mapped reactions
        self.missing_reactions = mapped_reacs[~mask]




# -------------------------------------
# GapFilling with GFF and DMND database
# -------------------------------------



[docs]
class GeneGapFiller(GapFiller):
    """Find gaps in the model using the GFF file of the underlying genome
    and a DMND database and optionally NCBI.

    This gap filling approach tries to identify missing genes from the GFF file
    and uses DIAMOND to run a blastp search for homologs against the DMND database

    .. note::

        Please keep in mind that using this module requires a model containing the Genbank locus tags as labels.
        If your model does not conform to this you can use one of the functions
        :py:func:`~refinegems.curation.curate.polish_model` or
        :py:func:`~refinegems.curation.curate.extend_gp_annots_via_mapping_table`.

    .. hint::

        Files required for the swissprot approach can be downloaded with
        :py:func:`~refinegems.utility.set_up.download_url`

    Attributes:
        - GapFiller Attributes:
            All attributes of the parent class :py:class:`~refinegems.classes.gapfill.GapFiller`
    """

    GFF_COLS = {
        "locus_tag": "locus_tag",
        "eC_number": "ec-code",
        "protein_id": "ncbiprotein",
    }  # :meta:


[docs]
    def __init__(self) -> None:
        super().__init__()
        self._variety = "GFF"



[docs]
    def find_missing_genes(self, gffpath: Union[str, Path], model: libModel):
        """Find missing genes by comparing the CDS regions written in the GFF
        with the GeneProduct entities in the model.

        Args:
            - gffpath (Union[str, Path]):
                Path to a GFF file (corresponding to the model).
            - model (libModel):
                The model loaded with libSBML.
        """

        # get all CDS from gff
        self.full_gene_list = parse_gff_for_cds(gffpath, self.GFF_COLS)
        # Statistics on full gene list based on GFF
        self._statistics['genes'] = insert_into_dict(
            self._statistics['genes'], (
                f"total (based on {self._variety})",
                (
                    self.full_gene_list["locus_tag"].nunique()
                    + int(self.full_gene_list["locus_tag"].isna().sum())
                    )
            ),
            'missing (total)'
        )
        
        # get all genes from model by locus tag
        model_locustags = [
            _f_gene(g.getLabel()) for g in model.getPlugin(0).getListOfGeneProducts()
        ]
        # filter
        self.missing_genes = self.full_gene_list.loc[
            ~self.full_gene_list["locus_tag"].isin(model_locustags)
        ]
        # formatting
        for col in self.GFF_COLS.values():
            if col not in self.missing_genes.columns:
                self.missing_genes[col] = None

        # collect stats
        self._statistics["genes"]["missing (total)"] = self.missing_genes[
            "locus_tag"
        ].nunique() + int(self.missing_genes["locus_tag"].isna().sum())

        # @ASK Should we not filter out all genes without locus tag here?
        # Even without ncbiprotein some entries could be without locus tag?
        # save genes with no locus tag for manual curation
        if "ncbiprotein" in self.missing_genes.columns:
            self.manual_curation["genes"]["gff no locus tag"] = self.missing_genes[
                self.missing_genes["locus_tag"].isna()
            ]["ncbiprotein"]
        else:
            self.missing_genes["ncbiprotein"] = None

        # Statistics on unmappable genes
        self._statistics["genes"]["unmappable"] = int(
            self.missing_genes["locus_tag"].isna().sum()
        )  # no locus tag

        # formatting
        # ncbiprotein | locus_tag | ec-code
        self.missing_genes = self.missing_genes[
            ~self.missing_genes["locus_tag"].isna()
        ]
        self.missing_genes = self.missing_genes.explode("ncbiprotein")




[docs]
    def find_missing_reactions(
        self,
        model: cobra.Model,
        # prefix for pseudo ncbiprotein ids
        prefix: str = "refinegems",
        type_db: Literal["swissprot", "user"] = "swissprot",
        # database information
        fasta: str = None,
        dmnd_db: str = None,
        map_db: str = None,
        # SwissProt - NCBI params
        mail: str = None,
        check_NCBI: bool = False,
        # other params
        threshold_add_reacs: int = 5,
        # further optional params for the mapping
        **kwargs,
    ) -> None:
        """Find missing reactions in the model by blasting the missing genes
        against the SwissProt database and mapping the results to EC/BRENDA.
        
        Optionally, query the protein accession numbers against NCBI
        (increases runtime significantly).

        .. note::
            When running this function with the default parameters, no mapping is performed.
            If you want to use the mapping, please provide the required parameters.
            The defaults are primarily to ensure smooth addition to workflows in pipelines.

        .. hint::

            For more information on how to get the SwissProt files, please see
            :py:func:`~refinegems.utility.set_up.download_url`.

        Args:
            - model (cobra.Model):
                The model loaded with COBRApy.
            - prefix (str, optional):
                Prefix for gene IDs in the model, the have to be generated
                randomly, as no ID from the chosen namespace (usually NCBI protein)
                has been found. Defaults to 'refinegems'.
            - mail (str, optional):
                Mail address for the query against NCBI.
                If not set, skips the mapping.
                Defaults to None.
            - check_NCBI (bool, optional):
                If set to True, checking the gene IDs / NCBI protein IDs against the
                NCBI database is enabled. Else, this step is skipped to reduce runtime.
                Only usable with SwissProt as database.
                Defaults to False.
            - fasta (str, optional):
                The protein FASTA file of the organism the model was build on.
                Required for the search against SwissProt.
                Defaults to None.
            - type_db (Literal['swissprot','user'], optional):
                Database to search against.
                Choose 'swissprot' for SwissProt or 'user' for a user defined database.
                Defaults to 'swissprot'.
            - dmnd_db (str, optional):
                Path to the DIAMOND database containing the protein sequences of SwissProt.
                Required for the search against SwissProt or the users own DIAMOND database.
                Defaults to None.
            - map_db (str, optional):
                Path to the SwissProt/users own mapping file.
                Required for the search against SwissProt/user-defined database.
                Greatly decreases runtime for running the DIAMOND search. 
                Note, that the mapping depends on the chosen database.
                Defaults to None.
            - threshold_add_reacs (int, optional):
                Threshold for the amount of reactions to add to the model.
                Defaults to 5.
            - kwargs:
                Further optional parameters for the mapping,
                e.g. outdir, sens, cov, t, pid, etc.
                For more information see :py:func:`refinegems.utility.db_access.map_to_homologs`
                in case of type_db = 'user' or :py:func:`refinegems.utility.db_access.get_ec_via_swissprot`
                in case of type_db = 'swissprot'.
        """
        self._variety = f"{self._variety} + {type_db}"

        # try to identfy missing ECs
        # --------------------------
        case_1 = self.missing_genes[self.missing_genes["ec-code"].isna()]
        not_case_1 = self.missing_genes[~self.missing_genes["ec-code"].isna()]
        if len(case_1) > 0:

            # BLAST against a dmnd to retrieve EC numbers
            # +++++++++++++++++++++++++++++++++++++++++++

            match type_db:
                # type_db = swissprot: BLAST against SwissProt
                # -> BLAST (DIAMOND) against SwissProt to get EC/BRENDA
                case "swissprot":
                    if fasta and dmnd_db and map_db:
                        case_1_mapped = get_ec_via_swissprot(
                            fasta, dmnd_db, case_1, map_db, **kwargs
                        )  # further optional params for the mapping
                        case_1.drop("ec-code", inplace=True, axis=1)
                        case_1 = case_1.merge(case_1_mapped, on="locus_tag", how="left")
                        not_case_1["uniprot"] = None
                        # still no EC but ncbiprotein
                        #       -> access ncbi for ec (optional)
                        # @DEBUG .......................
                        # case_1 = case_1.iloc[300:350,:]
                        # print(UserWarning('Running in debugging mode.'))
                        # ..............................
                        if check_NCBI and mail:
                            case_1["ec-code"] = case_1.progress_apply(
                                lambda x: (
                                    get_ec_from_ncbi(mail, x["ncbiprotein"])
                                    if not x["ec-code"]
                                    and not x["ncbiprotein"].isna()
                                    else x["ec-code"]
                                ),
                                axis=1,
                            )

                # type_db = user: BLAST against user defined database
                case "user":
                    if fasta and dmnd_db:
                        case_1 = map_to_homologs(
                            fasta, dmnd_db, case_1, map_db, email=mail, **kwargs
                        ) 
                        # Post processing
                        # ---------------
                        # Add information from self.missing_genes back into case_1
                        case_1 = case_1.merge(
                            self.missing_genes,
                            on="locus_tag",
                            how="left",
                        )
                        # Ensure that the correct column is renamed back to ncbiprotein
                        case_1.rename({'ncbiprotein_y': 'ncbiprotein', 'ec-code_x': 'ec-code'}, axis=1, inplace=True)

                        # Drop empty ec-code_y column
                        case_1.drop('ec-code_y', axis=1, inplace=True)

                case _:
                    raise ValueError(
                        "Please choose one of the valid database types 'swissprot' or 'user' for the GeneGapFiller."
                    )

        # no ncbiprotein, no EC
        self.manual_curation["genes"]["no ncbiprotein, no EC"] = case_1[
            case_1["ncbiprotein"].isna() & case_1["ec-code"].isna()
        ]
        self._statistics["genes"]["unmappable"] += self.manual_curation["genes"][
            "no ncbiprotein, no EC"
        ]["locus_tag"].nunique()

        # combine with the rest
        mapped_reacs = pd.concat(
            [
                case_1[~(case_1["ncbiprotein"].isna() & case_1["ec-code"].isna())],
                not_case_1,
            ]
        )

        # convert NaNs to None
        mapped_reacs.mask(mapped_reacs.isna(), other=None, inplace=True)

        # save entries with no EC for manual curation
        self.manual_curation["genes"]["no EC"] = mapped_reacs[
            mapped_reacs["ec-code"].isna()
        ]
        self._statistics["genes"]["unmappable"] += self.manual_curation["genes"][
            "no EC"
        ]["locus_tag"].nunique()
        mapped_reacs = mapped_reacs[~mapped_reacs["ec-code"].isna()]

        # check, if any automatic gapfilling is still possible
        if mapped_reacs.empty:
            logger.warning(
                f"No missing reactions for the provided model {model.id} were found via {self._variety}."
            )
            return None

        # create pseudoids for entries with no ncbiprotein id
        mapped_reacs["ncbiprotein"] = mapped_reacs.apply(
            lambda x: f'{x["locus_tag"]}' if not x["ncbiprotein"] else x["ncbiprotein"],
            axis=1,
        )

        # EC found
        # ---------
        # update the gene information
        updated_missing_genes = mapped_reacs.copy()
        if 'GeneID' in updated_missing_genes.columns: updated_missing_genes.rename({'GeneID': 'ncbigene'}, axis=1, inplace=True)
        updated_missing_genes = _clean_table_after_mapping(updated_missing_genes, 'gene')

        # reformat missing reacs
        if type_db == "swissprot":
            mapped_reacs.drop(["locus_tag"], inplace=True, axis=1)

        # transform table into EC-number vs. list of NCBI protein IDs
        eccode = (
            mapped_reacs["ec-code"]
            .apply(pd.Series)
            .reset_index()
            .melt(id_vars="index")
            .dropna()[["index", "value"]]
            .set_index("index")
        )
        ncbiprot = (
            mapped_reacs["ncbiprotein"]
            .apply(pd.Series)
            .reset_index()
            .melt(id_vars="index")
            .dropna()[["index", "value"]]
            .set_index("index")
        )
        mapped_reacs = pd.merge(
            eccode, ncbiprot, left_index=True, right_index=True
        ).rename(columns={"value_x": "ec-code", "value_y": "ncbiprotein"})
        mapped_reacs = (
            mapped_reacs.groupby(mapped_reacs["ec-code"])
            .aggregate({"ncbiprotein": "unique"})
            .reset_index()
        )

        # map EC to reactions
        mapped_reacs = map_ec_to_reac(
            mapped_reacs[["ec-code", "ncbiprotein"]], threshold_add_reacs
        )

        self._statistics["reactions"]["missing (based on genes)"] = mapped_reacs[
            "id"
        ].nunique()

        # save for manual curation
        self.manual_curation["reactions"]["no mapping"] = mapped_reacs[
            mapped_reacs["id"].isna()
        ]
        self._statistics["reactions"]["unmappable"] = int(
            self.manual_curation["reactions"]["no mapping"]["id"].isna().sum()
        )
        # map to model
        mapped_reacs = mapped_reacs[~mapped_reacs["id"].isna()]
        mapped_reacs["add_to_GPR"] = mapped_reacs.apply(
            lambda x: self._find_reac_in_model(model, x["ec-code"], x["id"], x["via"]),
            axis=1,
        )

        # Track all remaining statistics
        # statistics on reactions already in model
        self._statistics["reactions"]["already in model"] = mapped_reacs[
            ~mapped_reacs["add_to_GPR"].isna()
        ]["id"].nunique()

        # statistics on mapped reactions
        self._statistics["reactions"]["mapped to MNX"] = mapped_reacs[
            (mapped_reacs["add_to_GPR"].isna()) & (mapped_reacs["via"] == "MetaNetX")
        ]["id"].nunique()
        self._statistics["reactions"]["mapped to BiGG"] = mapped_reacs[
            (mapped_reacs["add_to_GPR"].isna()) & (mapped_reacs["via"] == "BiGG")
        ]["id"].nunique()
        self._statistics["reactions"]["mapped to KEGG"] = mapped_reacs[
            (mapped_reacs["add_to_GPR"].isna()) & (mapped_reacs["via"] == "KEGG")
        ]["id"].nunique()

        # calculate remaining statistics
        self._statistics["reactions"]["missing (total)"] = (
            self._statistics["reactions"]["missing (based on genes)"]
            - self._statistics["reactions"]["already in model"]
        )
        self._statistics["reactions"]["missing (remaining)"] = self._statistics[
            "reactions"
        ]["unmappable"]
        self._statistics["genes"]["missing (mappable)"] = updated_missing_genes[
            "locus_tag"
        ].nunique()
        self._statistics["genes"]["missing (remaining)"] = self._statistics["genes"][
            "unmappable"
        ]

        # update attributes
        if type_db == "user":
            updated_missing_genes = updated_missing_genes.drop(
                columns=["locus_tag_ref", "old_locus_tag"], axis=1
            )
        self.missing_genes = updated_missing_genes
        self.missing_reactions = mapped_reacs




############################################################################
# functions (2)
############################################################################


# Gap-filling via medium/media
# ----------------------------

[docs]
def single_cobra_gapfill(
    model: cobra.Model,
    universal: cobra.Model,
    medium: Medium,
    namespace: Literal["BiGG"] = "BiGG",
    growth_threshold: float = 0.05,
) -> Union[list[str], bool]:
    """Attempt gapfilling (with COBRApy) for a given model to allow growth on a given
    medium.

    Args:
        - model (cobra.Model):
            The model to perform gapfilling on.
        - universal (cobra.Model):
            A model with reactions to be potentially used for the gapfilling.
        - medium (Medium):
            A medium the model should grow on.
        - namespace (Literal['BiGG'], optional): Namespace to use for the model.
            Defaults to 'BiGG'.
        - growth_threshold (float, optional):  Minimal rate for the model to be considered growing.
            Defaults to 0.05.

    Returns:
        Union[list[str],True]:
            List of reactions to be added to the model to allow growth
            or True, if the model already grows.
    """

    # perform the gapfilling
    solution = []
    with model as model_copy:
        # set medium model should grow on
        medium_to_model(model_copy, medium, namespace, double_o2=False)
        # if model does not show growth (depending on threshold), perform gapfilling
        if model_copy.optimize().objective_value < growth_threshold:
            try:
                solution = cobra.flux_analysis.gapfill(
                    model_copy,
                    universal,
                    lower_bound=growth_threshold,
                    demand_reactions=False,
                )
            except cobra.exceptions.Infeasible:
                warnings.warn(
                    f"Gapfilling for medium {medium.name} failed. Manual curation required."
                )
        else:
            logger.info(
                f"Model already grows on medium {medium.name} with objective value of {model_copy.optimize().objective_value}"
            )
            return True

    return solution




[docs]
def cobra_gapfill_wrapper(
    model: cobra.Model,
    universal: cobra.Model,
    medium: Medium,
    namespace: Literal["BiGG"] = "BiGG",
    growth_threshold: float = 0.05,
    iterations: int = 3,
    chunk_size: int = 10000,
) -> cobra.Model:
    """Wrapper for :py:func:`~refinegems.classes.gapfill.single_cobra_gapfill`.

    Either use the full set of reactions in universal model by setting iteration to
    0 or None or use them in randomized chunks for faster runtime (useful
    on laptops). Note: when using the second option, be aware that this does not
    test all reaction combinations exhaustively (heuristic approach!!!).

    Args:
        - model (cobra.Model):
            The model to perform gapfilling on.
        - universal (cobra.Model):
            A model with reactions to be potentially used for the gapfilling.
        - medium (Medium):
            A medium the model should grow on.
        - namespace (Literal['BiGG'], optional):
            Namespace to use for the model.
            Options include 'BiGG'.
            Defaults to 'BiGG'.
        - growth_threshold (float, optional):
            Growth threshold for the gapfilling.
            Defaults to 0.05.
        - iterations (int, optional):
            Number of iterations for the heuristic version of the gapfilling.
            If 0 or None is given, uses full set of reactions.
            Defaults to 3.
        - chunk_size (int, optional):
            Number of reactions to be used for gapfilling at the same time.
            If None or 0 is given, use full set, not heuristic.
            Defaults to 10000.

    Returns:
        cobra.Model:
            The gapfilled model, if a solution was found.
    """

    solution = []

    # run a heuristic approach:
    #     for a given number of iterations, use a subset (chunk_size) of
    #     reactions for the gapfilling
    if (iterations and iterations > 0) and (chunk_size and chunk_size > 0):

        num_reac = len(model.reactions)
        # for each iteration
        for i in range(iterations):
            not_used = [_ for _ in range(0, num_reac)]

            # divide reactions in random subsets
            for chunk in range(math.ceil(num_reac / chunk_size)):
                if len(not_used) > chunk_size:
                    rng = np.random.default_rng()
                    reac_set = rng.choice(
                        not_used, size=chunk_size, replace=False, shuffle=False
                    )
                    not_used = [_ for _ in not_used if _ not in reac_set]
                else:
                    reac_set = not_used

                # get subset of reactions
                subset_reac = cobra.Model("subset_reac")
                for n in reac_set:
                    subset_reac.add_reactions([universal.reactions[n].copy()])

                # gapfilling
                solution = single_cobra_gapfill(
                    model, subset_reac, medium, namespace, growth_threshold
                )

                if (isinstance(solution, bool) and solution) or (
                    isinstance(solution, list) and len(solution) > 0
                ):
                    break

            if (isinstance(solution, bool) and solution) or (
                isinstance(solution, list) and len(solution) > 0
            ):
                break

    # use the whole reactions content of the universal model at once
    #     not advised for Laptops and PCs with small computing power
    #     may take a long time
    else:
        solution = single_cobra_gapfill(
            model, universal, medium, namespace, growth_threshold
        )

    # if solution is found add new reactions to model
    if isinstance(solution, list) and len(solution) > 0:
        for reac in solution[0]:
            reac.notes["creation"] = "via gapfilling"
        logger.info(
            f"Adding {len(solution[0])} reactions to model to ensure growth on medium {medium.name}."
        )
        model.add_reactions(solution[0])

    return model




[docs]
def multiple_cobra_gapfill(
    model: cobra.Model,
    universal: cobra.Model,
    media_list: list[Medium],
    namespace: Literal["BiGG"] = "BiGG",
    growth_threshold: float = 0.05,
    iterations: int = 3,
    chunk_size: int = 10000,
) -> cobra.Model:
    """Perform :py:func:`~refinegems.classes.gapfill.cleanup.single_cobra_gapfill` on a list of media.

    Args:
        - model (cobra.Model):
            The model to be gapfilled.
        - universal (cobra.Model):
            The model to use reactions for gapfilling from.
        - media_list (list[Medium]):
            List ofmedia the model is supposed to grow on.
        - growth_threshold (float, optional):
            Growth threshold for the gapfilling.
            Defaults to 0.05.
        - iterations (int, optional):
            Number of iterations for the heuristic version of the gapfilling.
            If 0 or None is given, uses full set of reactions.
            Defaults to 3.
        - chunk_size (int, optional):
            Number of reactions to be used for gapfilling at the same time.
            If None or 0 is given, use full set, not heuristic.
            Defaults to 10000.
    Returns:
        cobra.Model:
            The gapfilled model, if a solution was found.
    """

    for medium in media_list:
        model = cobra_gapfill_wrapper(
            model,
            universal,
            medium,
            namespace,
            iterations,
            chunk_size,
            growth_threshold,
        )

    return model