Source code for refinegems.utility.set_up

"""Collection of functions for setting up files and work environments."""

__author__ = "Carolin Brune"

################################################################################
# requirements
################################################################################

import requests
import subprocess
import warnings

from importlib.resources import files
from pathlib import Path
from tqdm import tqdm
from typing import Literal

################################################################################
# variables
################################################################################

# Location of the media configuration template ("media_config.yml") that is
# shipped inside the ``refinegems.data.config`` package. It is resolved via
# importlib.resources and serves as the copy source in download_config.
PATH_MEDIA_CONFIG = files("refinegems.data.config").joinpath(
    "media_config.yml"
)  #: :meta hide-value:

################################################################################
# functions
################################################################################

# --------------------------
# download databases / files
# --------------------------


[docs] def download_url( download_type: Literal["SwissProt gapfill"], directory: str = None, k: int = 10, t: int = 1, ): """Download files necessary for certain functionalities of the toolbox from the internet. Currently available: - 'SwissProt gapfill': download files needed for the :py:class:`~refinegems.classes.gapfill.GeneGapFiller` Args: - dowload_type (Literal['SwissProt gapfill']): Type of files to download. - directory (str, optional): Path to a directory to save the downloaded files to. Defaults to None (So the current working directory is used). - k (int, optional): Chunksize in kB. Defaults to 10. - t (int, optional): Number of threads to use for some additional setups, e.g. DIAMOND database creation. Defaults to 1. Raises: - ValueError: Unknown database or file """ # Ensure that directory is not None if directory is None: directory = Path.cwd() # match URLS to type of database, that the user wants to download match download_type: case "SwissProt gapfill": swissprot_api = "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" swissprot_mapping_api = "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cxref_brenda%2Cec&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29" urls = { "SwissProt.fasta": swissprot_api, "SwissProt_mapping.tsv": swissprot_mapping_api, } # 1.: TSV with UniprotID, BRENDA and EC -7.7MB (26.07.2024) # 2.: FASTA with sequences ~280MB (26.07.2024) case _: mes = f"Unknown database or file: {download_type}" raise ValueError(mes) # download each file for name, url in urls.items(): r = requests.get(url, stream=True) filename = Path(directory, name) # Check if Content-Length is available total_length = r.headers.get("Content-Length") # Make the progress bar if total_length is None: # Content-Length is missing, so we download without a progress bar with open(filename, "wb") as f: for chunk in r.iter_content(chunk_size=k * 1024): if chunk: f.write(chunk) else: total_length = 
int(total_length) with open(filename, "wb") as f: pbar = tqdm( desc=f"Downloading {name}", unit="B", unit_scale=True, unit_divisor=1024, total=total_length, ) pbar.clear() for chunk in r.iter_content(chunk_size=k * 1024): if chunk: # Filter out keep-alive new chunks pbar.update(len(chunk)) # Update progress bar f.write(chunk) pbar.close() # additional setups match download_type: # SwissProt gapfill case "SwissProt gapfill": # Create db folder for DIAMONd if non-existent try: Path(directory, "db").mkdir(parents=True, exist_ok=False) print("Creating new directory " + str(Path(directory, "db"))) except FileExistsError: warnings.warn( "Given directory already exists. High possibility of files being overwritten." ) Path(directory, "db").mkdir(parents=True, exist_ok=True) # create DIAMOND database print(f"create DIAMOND database for {download_type} using:") print( f'diamond makedb --in {Path(directory, "SwissProt.fasta")} --db {str(Path(directory,"db","SwissProt.dmnd"))} --threads {int(t)}' ) subprocess.run( [ "diamond", "makedb", "--in", str(Path(directory, "SwissProt.fasta")), "--db", str(Path(directory, "db", "SwissProt.dmnd")), "--threads", str(t), ] ) # Type for which no extra setup is needed case _: pass
# ---------------------
# handling config files
# ---------------------
[docs] def download_config(filename: str = "./my_config.yaml", type=Literal["media"]): """Load a configuration file from the package and save a copy of it for the user to edit. Args: - filename (str, optional): Filename to write the config to/save it under as. Defaults to './my_config.yaml'. - type (Literal['media'], optional): Type of configuration file to load. Can be 'media' for the media config file. Defaults to Literal['media']. """ def copy_config_yaml(infile: str, outfile: str): """Helper function for :py:func:`download_config`. Performs the actual download of a yaml file into a copy for the user to edit. Args: - infile (str): Path to the file to copy. - outfile (str): Path to write the copy to. """ with open(infile, "r") as cfg_file, open(outfile, "w") as cfg_out: for line in cfg_file: cfg_out.write(line) # copy an examplary version of the config file for the user to edit it match type: # copy media config case "media": copy_config_yaml(PATH_MEDIA_CONFIG, filename) # type not found case _: raise ValueError(f"Unknown type of config file detected: {type}")