Source code for refinegems.utility.databases

#!/usr/bin/env python
"""Variables, functions and more for the developement, extension and maintainance of
the in-build database.

.. note::

   Some functionalities for handling and dealing with the media are in the
   :py:mod:`~refinegems.classes.medium` module.

.. hint::

   Further functions for accessing the database can be found in the :py:mod:`~refinegems.utility.io`
   module, e.g. :py:func:`~refinegems.utility.io.load_a_table_from_database`
"""

__author__ = "Gwendolyn O. Döbel und Carolin Brune"

################################################################################
# requirements
################################################################################

import io
import re
import sqlite3
import requests
import logging
import pandas as pd

from enum import Enum
from sqlite3 import Error
from tqdm import tqdm
from pathlib import Path
from importlib.resources import files
from typing import Union

################################################################################
# variables
################################################################################

PATH_TO_DB_FOLDER = files("refinegems.data.database")  #: :meta hide-value:
PATH_TO_DB = PATH_TO_DB_FOLDER.joinpath("data.db")  #: :meta hide-value:
VERSION_FILE = PATH_TO_DB_FOLDER.joinpath(
    "current_bigg_db_version.txt"
)  #: :meta hide-value:
VERSION_URL = "http://bigg.ucsd.edu/api/v2/database_version"  #: :meta:

mnx_db_namespace = {
    "reac_prop": (
        "https://www.metanetx.org/cgi-bin/mnxget/mnxref/reac_prop.tsv",
        ["id", "mnx_equation", "reference", "ec-code", "is_balanced", "is_transport"],
    ),
    "reac_xref": (
        "https://www.metanetx.org/cgi-bin/mnxget/mnxref/reac_xref.tsv",
        ["source", "id", "description"],
    ),
    "chem_prop": (
        "https://www.metanetx.org/cgi-bin/mnxget/mnxref/chem_prop.tsv",
        [
            "id",
            "name",
            "reference",
            "formula",
            "charge",
            "mass",
            "InChI",
            "InChIKey",
            "SMILES",
        ],
    ),
    "chem_xref": (
        "https://www.metanetx.org/cgi-bin/mnxget/mnxref/chem_xref.tsv",
        ["source", "id", "description"],
    ),
}

################################################################################
# functions
################################################################################


[docs] class ValidationCodes(Enum): """Validation codes for the database Args: - Enum (Enum): Provided as input to get a number mapping for the codes """ COMPLETE = (0,) # All tables are in data.db EMPTY = (1,) # data.db is either empty or incorrect BIGG = (2,) # Only BiGG tables are in data.db MEDIA = (3,) # Only Media tables are in data.db BIGG_MEDIA = (4,) # Only BiGG and media tables are in data.db MODELSEED_COMPOUNDS = (5,) # Only ModelSEED compounds table is in data.db BIGG_MSEED_COMPOUNDS = ( 6, ) # Only Bigg and ModelSEED compounds tables are in data.db MEDIA_MSEED_COMPOUNDS = ( 7 # Only media and ModelSEED compounds tables are in data.db # )
validation_messages = { ValidationCodes.COMPLETE: "All tables in data up-to-date. Initialisation complete.", ValidationCodes.EMPTY: "No table in data. An error must have occurred during initialisation.", ValidationCodes.BIGG: "Data only contains the BiGG tables. Please check the remaining tables.", ValidationCodes.MEDIA: "Data only contains the media tables. Please check the BiGG and ModelSEED tables.", ValidationCodes.BIGG_MEDIA: "Data only contains the BiGG and media tables. Please check the ModelSEED table.", ValidationCodes.MODELSEED_COMPOUNDS: "Data only contains the ModelSEED table. Please check the BiGG and media tables.", ValidationCodes.BIGG_MSEED_COMPOUNDS: "Data only contains the BiGG and ModelSEED tables. Please check the media tables.", ValidationCodes.MEDIA_MSEED_COMPOUNDS: "Data only contains the media and ModelSEED tables. Please check the BiGG tables.", }
[docs] def is_valid_database(db_cursor: sqlite3.Cursor) -> int: """Verifies if database has: - 2 tables with names 'bigg_metabolites' & 'bigg_reactions' - 6 tables with names 'medium', 'substance', 'substance2db' & 'medium2substance', 'subset' & 'subset2substance' - 1 table with name 'modelseed_compounds' Args: - db_cursor (sqlite3.Cursor): Cursor from open connection to the database (data.db) Returns: int: Corresponding to one of the ValidationCodes """ print("Verifying database...") # Fetches the table names as string tuples from the connected database db_cursor.execute("SELECT name FROM sqlite_master WHERE type='table';") tables = [string[0] for string in db_cursor.fetchall()] bigg_tables_contained = ( len([s for s in tables if re.match(r"^bigg_(?!to)(.*?)", s, re.IGNORECASE)]) == 2 ) media_tables_contained = ( len( [ s for s in tables if re.match( r"^medium(.*?)|^substance(.*?)|^subset(.*?)", s, re.IGNORECASE ) ] ) == 6 ) modelseed_cmpd_tbl_contained = ( len([s for s in tables if s == "modelseed_compounds"]) == 1 ) bigg_media_tbls_contained = bigg_tables_contained and media_tables_contained bigg_modelseed_cmpd_tbls_contained = ( bigg_tables_contained and modelseed_cmpd_tbl_contained ) media_modelseed_cmpd_tbls_contained = ( media_tables_contained and modelseed_cmpd_tbl_contained ) all_tables_contained = ( bigg_media_tbls_contained and modelseed_cmpd_tbl_contained ) if all_tables_contained: return ValidationCodes.COMPLETE elif bigg_modelseed_cmpd_tbls_contained: return ValidationCodes.BIGG_MSEED_COMPOUNDS elif media_modelseed_cmpd_tbls_contained: return ValidationCodes.MEDIA_MSEED_COMPOUNDS elif bigg_media_tbls_contained: return ValidationCodes.BIGG_MEDIA elif bigg_tables_contained: return ValidationCodes.BIGG elif media_tables_contained: return ValidationCodes.MEDIA elif modelseed_cmpd_tbl_contained: return ValidationCodes.MODELSEED_COMPOUNDS else: return ValidationCodes.EMPTY
[docs] def create_media_database(db_cursor: sqlite3.Cursor): """Creates the media database with 4 tables ('medium', 'substance', 'substance2db', 'medium2substance') from file './data/database/media_db.sql' Args: - db_cursor (sqlite3.Cursor): Cursor from open connection to the database (data.db) """ db_cursor.executescript( """ DROP TABLE IF EXISTS susbtance; DROP TABLE IF EXISTS substance2db; DROP TABLE IF EXISTS medium; DROP TABLE IF EXISTS medium2substance; """ ) print("Adding media tables...") with open(files("refinegems.data.database").joinpath("media_db.sql")) as schema: db_cursor.executescript(schema.read())
[docs] def update_bigg_db(latest_version: str, db_connection: sqlite3.Connection) -> dict: """Updates the BiGG tables 'bigg_metabolites' & 'bigg_reactions' within a database (data.db) Args: - latest_version (str): String containing the Path to a file with the latest version of the BiGG database - db_connection (sqlite3.Connection): Open connection to the database (data.db) """ def get_database_links_info_per_row(row: pd.Series): """For a single row of the dataframe, extract the database identifier from the collection of database links. Args: - ow (pd.Series): The database links for a single row of the dataframe. Returns: dict: A dictionary with the databses as keys and the IDs as values. """ database_ids = {} if isinstance(row, str): links = row.split(";") for link in links: key, value = link.split(":", 1) key = key.strip() value = value.rsplit("/", 1)[1].strip() value = re.sub(r"^(?i)meta:", "", value) if key in database_ids.keys(): database_ids[key].append(value) else: database_ids[key] = [value] for k in database_ids.keys(): database_ids[k] = ", ".join(database_ids[k]) return database_ids def get_database_links(data: pd.DataFrame) -> pd.DataFrame: """For the dataframe, extract the database IDs from the database_links column into separate column containing the IDs and not the links. Args: - data (pd.DataFrame): The input dataframe. Returns: pd.DataFrame: The edited dataframe """ data = data.join( pd.DataFrame( [get_database_links_info_per_row(row) for row in data["database_links"]] ) ) data.drop(columns=["database_links", "model_list"], axis=1, inplace=True) return data print("Adding BiGG tables...") # Store currently used version with open(VERSION_FILE, "w") as file: file.write(latest_version) # Create BiGG metabolites table BIGG_MODELS_METABS_URL = ( "http://bigg.ucsd.edu/static/namespace/bigg_models_metabolites.txt" ) bigg_models_metabs = requests.get(BIGG_MODELS_METABS_URL).text bigg_models_metabs_df = pd.read_csv( io.StringIO(bigg_models_metabs), dtype=str, sep="\t" ) bigg_models_metabs_df.rename(columns={"bigg_id": "id"}, inplace=True) bigg_models_metabs_df = get_database_links(bigg_models_metabs_df) bigg_id_duplicates = bigg_models_metabs_df.duplicated(subset=["id"], keep=False) bigg_id_duplicates_df = bigg_models_metabs_df[bigg_id_duplicates] bigg_id_duplicates_set = set(bigg_id_duplicates_df["id"].tolist()) bigg_models_metabs_df[~bigg_id_duplicates].to_sql( "bigg_metabolites", db_connection, if_exists="replace", index=False, dtype={"id": "TEXT PRIMARY KEY"}, ) if bigg_id_duplicates_set: logging.warning( "The BiGG metabolite table contains the following " f"{len(bigg_id_duplicates_set)} duplicate(s):\n" f"{bigg_id_duplicates_set}\n" "Duplicate(s) are completely removed from the table." ) # Create BiGG reactions table BIGG_MODELS_REACS_URL = ( "http://bigg.ucsd.edu/static/namespace/bigg_models_reactions.txt" ) bigg_models_reacs = requests.get(BIGG_MODELS_REACS_URL).text bigg_models_reacs_df = pd.read_csv( io.StringIO(bigg_models_reacs), dtype=str, sep="\t" ) bigg_models_reacs_df.rename(columns={"bigg_id": "id"}, inplace=True) bigg_models_reacs_df = get_database_links(bigg_models_reacs_df) bigg_models_reacs_df.to_sql( "bigg_reactions", db_connection, if_exists="replace", index=False, dtype={"id": "TEXT PRIMARY KEY"}, )
[docs] def get_latest_bigg_databases( db_connection: sqlite3.Connection, is_missing: bool = True ): """Gets the latest BiGG tables for metabolites & reactions if: - No version file is locally available - The version in the local version file is NOT the latest - No BiGG tables currently exist in the database Args: - db_connection (sqlite3.Connection): Open connection to the database (data.db) - is_missing (bool, optional): True if no BiGG tables are in the database. Defaults to True. """ # Check if BiGG database had an update LATEST_VERSION = requests.get(VERSION_URL).json()["bigg_models_version"] if not Path.exists(VERSION_FILE) or is_missing: update_bigg_db(LATEST_VERSION, db_connection) else: with open(VERSION_FILE, "r") as file: version = file.readline().strip() if version != LATEST_VERSION: update_bigg_db(LATEST_VERSION, db_connection)
[docs] def get_modelseed_compounds_database(db_connection: sqlite3.Connection): """Retrieves the compounds table from ModelSEED from the respective GitHub repository Args: - db_connection (sqlite3.Connection): Open connection to the database (data.db) """ print("Adding the ModelSEED compounds table...") MODELSEED_COMPOUNDS_URL = "https://raw.githubusercontent.com/ModelSEED/ModelSEEDDatabase/master/Biochemistry/compounds.tsv" modelseed_compounds = requests.get(MODELSEED_COMPOUNDS_URL).text modelseed_df = pd.read_csv( io.StringIO(modelseed_compounds), sep="\t", dtype={"linked_compound": str} ) #'mass': float, modelseed_df.to_sql( "modelseed_compounds", db_connection, if_exists="replace", index=False, dtype={"id": "TEXT PRIMARY KEY", "mass": "NUMERIC", "linked_compound": "TEXT"}, )
[docs] def initialise_database(): """Initialises/updates the database (data.db) After initialisation the database contains: - 2 tables with names 'bigg_metabolites' & 'bigg_reactions' - 6 tables with names 'medium', 'substance', 'medium2substance', 'substance2db', 'subset' & 'subset2substance' - 1 table with name 'modelseed_compounds' """ # Initialise empty connection con = None print("Initialising database...") # Try to open connection & get cursor try: con = sqlite3.connect(PATH_TO_DB) cursor = con.cursor() validity_code = is_valid_database(cursor) if validity_code == ValidationCodes.BIGG: print("Only BiGG tables contained in database.") create_media_database(cursor) get_modelseed_compounds_database(con) elif validity_code == ValidationCodes.MEDIA: print("Only media tables contained in database.") get_latest_bigg_databases(con) get_modelseed_compounds_database(con) elif validity_code == ValidationCodes.MODELSEED_COMPOUNDS: print("Only ModelSEED compounds table contained in database.") create_media_database(cursor) get_latest_bigg_databases(con) elif validity_code == ValidationCodes.BIGG_MEDIA: print("Only BiGG and media tables contained in database.") get_modelseed_compounds_database(con) elif validity_code == ValidationCodes.BIGG_MSEED_COMPOUNDS: print("Only BiGG and ModelSEED compounds tables contained in database.") create_media_database(cursor) elif validity_code == ValidationCodes.MEDIA_MSEED_COMPOUNDS: print( "Only media and ModelSEED compounds tables contained in database." ) get_latest_bigg_databases(con) elif validity_code == ValidationCodes.EMPTY: print( "Incorrect or empty database. Initialise database with required tables..." ) create_media_database(cursor) get_latest_bigg_databases(con) get_modelseed_compounds_database(con) elif validity_code == ValidationCodes.COMPLETE: print("Verifying if BiGG tables are up-to-date...") get_latest_bigg_databases(con, False) except Error as e: print(e) finally: if con: con.close() # Validate initialised database con = sqlite3.connect(PATH_TO_DB) cursor = con.cursor() validity_code = is_valid_database(cursor) print(validation_messages.get(validity_code)) con.close()
# about MetaNetX Information # -------------------------- # DISCLAIMER: # Database information from MetaNetX # distributed under https://creativecommons.org/licenses/by/4.0/ # Citation: MetaNetX/MNXref: unified namespace for metabolites and biochemical reactions in the context of metabolic models # Sébastien Moretti, Van Du T Tran, Florence Mehl, Mark Ibberson, Marco Pagni # Nucleic Acids Research (2021), 49(D1):D570-D574
[docs] def update_mnx_namespaces(db: Union[Path, str] = PATH_TO_DB, chunksize: int = 1): """Add or update the MetaNetX namespace to/in a database. Args: - db (Union[Path,str],optional): Path to a database to add the namespace to. Defaults to the in-build database. - chunksize (int, optional): Size of the chunk (in kB) to download at once. Defaults to 1. """ con = sqlite3.connect(db) for name, values in mnx_db_namespace.items(): link, colnames = values mnx_table = [] for chunk in tqdm( pd.read_csv( link, sep="\t", comment="#", names=colnames, chunksize=chunksize * 1024 ), desc=f"Downloading {name}", unit="B", ): mnx_table.append(chunk) match name: # Reaction property table case "reac_prop": total_len = sum([len(_) for _ in mnx_table]) with tqdm(total=total_len, unit="entries", desc="Add to DB") as pbar: for i, chunk in enumerate(mnx_table): if i == 0: exists = "replace" else: exists = "append" chunk.to_sql( "mnx_" + name, con, if_exists=exists, index=False, dtype={"id": "TEXT PRIMARY KEY"}, ) pbar.update(len(chunk)) # Reaction cross-reference table case "reac_xref": cursor = con.cursor() cursor.execute("DROP TABLE IF EXISTS mnx_reac_xref") empty_table = """ CREATE TABLE mnx_reac_xref ( source TEXT, id TEXT, description TEXT, CONSTRAINT PK_mnx_reac_xref PRIMARY KEY (source,id) FOREIGN KEY(id) REFERENCES mnx_reac_prop(id) ); """ cursor.execute(empty_table) total_len = sum([len(_) for _ in mnx_table]) with tqdm(total=total_len, unit="entries", desc="Add to DB") as pbar: for i, chunk in enumerate(mnx_table): chunk.to_sql( "mnx_" + name, con, if_exists="append", index=False ) pbar.update(len(chunk)) # Metabolite properties table case "chem_prop": total_len = sum([len(_) for _ in mnx_table]) with tqdm(total=total_len, unit="entries", desc="Add to DB") as pbar: for i, chunk in enumerate(mnx_table): if i == 0: exists = "replace" else: exists = "append" chunk.to_sql( "mnx_" + name, con, if_exists=exists, index=False, dtype={"id": "TEXT PRIMARY KEY"}, ) pbar.update(len(chunk)) # Metabolite reference table case "chem_xref": total_len = sum([len(_) for _ in mnx_table]) cursor = con.cursor() cursor.execute("DROP TABLE IF EXISTS mnx_chem_xref") empty_table = """ CREATE TABLE mnx_chem_xref ( source TEXT, id TEXT, description TEXT, CONSTRAINT PK_mnx_chem_xref PRIMARY KEY (source,id) FOREIGN KEY(id) REFERENCES mnx_chem_prop(id) ); """ cursor.execute(empty_table) with tqdm(total=total_len, unit="entries", desc="Add to DB") as pbar: for i, chunk in enumerate(mnx_table): chunk.to_sql( "mnx_" + name, con, if_exists="append", index=False ) pbar.update(len(chunk)) con.close()
[docs] def reset_database(database: Union[Path, str] = PATH_TO_DB): """Remove tables for certain databases to allow pushing of the database to GitHub (reduce size). Args: - database (Path | str, optional): Path to the database. Defaults to PATH_TO_DB, the in-build database. """ # establish connection con = sqlite3.connect(database) cursor = con.cursor() # remove MetaNetX tables cursor.execute("DROP TABLE IF EXISTS mnx_chem_xref") cursor.execute("DROP TABLE IF EXISTS mnx_reac_xref") cursor.execute("DROP TABLE IF EXISTS mnx_chem_prop") cursor.execute("DROP TABLE IF EXISTS mnx_reac_prop") # close connection con.close()