Source code for vireadb.vireadb

#! /usr/bin/env python3
'''
ViReaDB: Viral Read Database
'''

# imports
from .common import *
from .compress import *
from .cram import *
from .fasta import *
from os import remove
from os.path import isdir, isfile
from shutil import copyfile
from subprocess import call, check_output, DEVNULL, PIPE, Popen
from sys import argv
from tempfile import NamedTemporaryFile
from warnings import warn
import argparse
import sqlite3

# constants
META_TABLE_COLS = ('key', 'val')
SEQS_TABLE_COLS = ('ID', 'CRAM', 'POS_COUNTS_XZ', 'INS_COUNTS_XZ', 'CONSENSUS_XZ')

# base commands
BASE_COMMAND_SAMTOOLS_VIEW_CRAM = [
    'samtools', 'view',
    '--output-fmt-option', 'version=3.0', # TODO update to 3.1 when stable
    '--output-fmt-option', 'use_lzma=1',
    '--output-fmt-option', 'archive=1',
    '--output-fmt-option', 'level=9',
    '-C', # CRAM output
]
BASE_COMMAND_MINIMAP2 = [
    'minimap2',
    '-x', 'sr',
]
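
# Example (sketch, hypothetical filenames): for FASTQ input, add_entry() builds the
# full commands from the bases above and pipes minimap2's SAM output into samtools:
#
#   command_minimap2 = BASE_COMMAND_MINIMAP2 + ['-a', 'ref.mmi', '-t', '4', 'reads.fastq']
#   command_samtools = BASE_COMMAND_SAMTOOLS_VIEW_CRAM + ['-T', 'ref.fas', '-@', '4', '-F', '4']
#   p = Popen(command_minimap2, stdout=PIPE, stderr=DEVNULL)
#   cram_data = check_output(command_samtools, stdin=p.stdout)
#   p.wait()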

class ViReaDB:
    '''``ViReaDB`` database class'''
    def __init__(self, db_fn, bufsize=DEFAULT_BUFSIZE):
        '''``ViReaDB`` constructor

        Args:
            ``db_fn`` (``str``): The filename of the SQLite3 database file representing this database

            ``bufsize`` (``int``): Buffer size for reading from file

        Returns:
            ``ViReaDB`` object
        '''
        self.con = sqlite3.connect(db_fn)
        self.cur = self.con.cursor()
        self.version = self.cur.execute("SELECT val FROM meta WHERE key='VERSION' LIMIT 1").fetchone()[0]
        self.ref_name = self.cur.execute("SELECT val FROM meta WHERE key='REF_NAME' LIMIT 1").fetchone()[0]
        ref_seq_xz = self.cur.execute("SELECT val FROM meta WHERE key='REF_SEQ_XZ' LIMIT 1").fetchone()[0]
        self.ref_seq = decompress_str(ref_seq_xz)
        self.ref_len = len(self.ref_seq)
        self.ref_f = NamedTemporaryFile('w', prefix='vireadb', suffix='.fas', buffering=bufsize)
        self.ref_f.write('%s\n%s\n' % (self.ref_name, self.ref_seq)); self.ref_f.flush()
        mmi_data = self.cur.execute("SELECT val FROM meta WHERE key='REF_MMI' LIMIT 1").fetchone()[0]
        if mmi_data is None: # try to index ref genome if user has now installed Minimap2
            mmi_data = index_ref_genome(self.ref_f.name)
            if mmi_data is not None:
                self.cur.execute("INSERT INTO meta VALUES(?, ?)", ('REF_MMI', mmi_data))
        if mmi_data is None:
            self.mmi_f = None
        else:
            self.mmi_f = NamedTemporaryFile('wb', prefix='vireadb', suffix='.mmi', buffering=bufsize)
            self.mmi_f.write(mmi_data); self.mmi_f.flush()

    def __del__(self):
        '''``ViReaDB`` destructor'''
        self.con.close()

    def __len__(self):
        '''Return the number of entries in this database

        Returns:
            The number of entries in this database
        '''
        return self.cur.execute("SELECT COUNT(*) FROM seqs").fetchone()[0]

    def __getitem__(self, ID):
        '''Shorthand for ``get_entry(ID)``'''
        return self.get_entry(ID)

    def __delitem__(self, ID):
        '''Shorthand for ``del_entry(ID)``'''
        self.del_entry(ID)

    def __contains__(self, ID):
        '''Check if ``ID`` exists in this database

        Args:
            ``ID`` (``str``): The ID to check

        Returns:
            ``True`` if ``ID`` exists, otherwise ``False``
        '''
        # parameterized query (string interpolation would break on IDs containing quotes)
        return self.cur.execute("SELECT COUNT(*) FROM seqs WHERE ID=? LIMIT 1", (ID,)).fetchone()[0] != 0

    def __iter__(self):
        '''Iterate over the rows of the ``seqs`` table'''
        return iter(self.cur.execute("SELECT * FROM seqs").fetchall())
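    # Example (sketch): a ViReaDB object supports dict-like access; 'my.db' and
    # 'sample1' below are hypothetical names for an existing database and entry:
    #
    #   db = ViReaDB('my.db')                        # open (see also load_db)
    #   n = len(db)                                  # number of entries
    #   if 'sample1' in db:                          # membership check
    #       cram, pos, ins, cons = db['sample1']     # shorthand for get_entry
    #   for row in db:                               # iterate over raw seqs rows
    #       print(row[0])                            # row[0] is the entry ID
    #   del db['sample1']                            # shorthand for del_entry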
    def commit(self):
        '''Commit the SQLite3 database'''
        self.con.commit()
    def vacuum(self):
        '''Rebuild the database file, repacking it into the minimal amount of disk space'''
        tmp = self.con.isolation_level
        self.con.isolation_level = None # workaround for bug in Python 3.6.0's sqlite3 module (https://github.com/ghaering/pysqlite/issues/109#issuecomment-277506491)
        self.cur.execute('VACUUM')
        self.con.isolation_level = tmp
    def get_meta(self):
        '''Get the metadata from this ``ViReaDB`` database

        Returns:
            ``dict`` object containing the metadata of this ``ViReaDB`` database
        '''
        return {'VERSION': self.version, 'REF_NAME': self.ref_name, 'REF_SEQ': self.ref_seq}
    def add_entry(self, ID, reads_fn, filetype=None, lossy_names=True, include_unmapped=False, check_unique=True, bufsize=DEFAULT_BUFSIZE, threads=DEFAULT_THREADS, commit=True, verbose=False):
        '''Add a CRAM/BAM/SAM/FASTQ entry to this database. CRAM inputs are added exactly as-is.

        Args:
            ``ID`` (``str``): The unique ID of the entry to add

            ``reads_fn`` (``str``): The input reads file. Can provide a list of multiple files if FASTQ

            ``filetype`` (``str``): The format of the input reads file (CRAM, BAM, SAM, or FASTQ), or ``None`` to infer from ``reads_fn``

            ``lossy_names`` (``bool``): ``True`` to discard read names when both reads of a read-pair are in the same CRAM slice (results in better compression), otherwise ``False`` to keep all read names

            ``include_unmapped`` (``bool``): Include unmapped reads when converting from non-CRAM formats

            ``check_unique`` (``bool``): Check that ``ID`` doesn't already exist. Should only be skipped if the user is already guaranteed to not have duplicates

            ``bufsize`` (``int``): Buffer size for reading from file

            ``threads`` (``int``): Number of threads to use for compression

            ``commit`` (``bool``): Commit database after adding this entry

            ``verbose`` (``bool``): ``True`` to enable verbose messages (e.g. samtools and minimap2 commands), otherwise ``False``
        '''
        # check for validity
        if check_unique and ID in self:
            raise ValueError("ID already exists in database: %s" % ID)
        if isinstance(reads_fn, list):
            if len(reads_fn) == 0:
                raise ValueError("Must specify at least 1 reads file")
            elif len(reads_fn) == 1:
                reads_fn = reads_fn[0]
        if isinstance(reads_fn, str) and not isfile(reads_fn):
            raise ValueError("File not found: %s" % reads_fn)
        if filetype is None:
            if isinstance(reads_fn, str):
                fns = [reads_fn]
            else:
                fns = reads_fn
            for fn in fns:
                curr = fn.upper()
                if curr.endswith('.GZ'): # strip the '.GZ' suffix (str.rstrip strips a character set, not a suffix)
                    curr = curr[:-3]
                curr = curr.split('.')[-1]
                if filetype is None:
                    filetype = curr
                elif filetype != curr:
                    raise ValueError("All reads_fn arguments must be the same filetype")
        if not isinstance(filetype, str):
            raise TypeError("Invalid filetype: %s (must be CRAM, BAM, SAM, or FASTQ)" % filetype)
        filetype = filetype.strip().upper()
        if not isinstance(reads_fn, str) and filetype != 'FASTQ':
            raise ValueError("Can only provide multiple reads files for FASTQ")
        if not isinstance(threads, int) or threads < 1:
            raise ValueError("Invalid number of threads: %s" % threads)

        # prep samtools and minimap2 commands
        command_samtools_view_cram = BASE_COMMAND_SAMTOOLS_VIEW_CRAM + ['-T', self.ref_f.name, '-@', str(threads)]
        if lossy_names:
            command_samtools_view_cram += ['--output-fmt-option', 'lossy_names=1']
        if not include_unmapped:
            command_samtools_view_cram += ['-F', '4'] # only include mapped reads
        if self.mmi_f is None:
            command_minimap2 = None
        else:
            command_minimap2 = BASE_COMMAND_MINIMAP2 + ['-a', self.mmi_f.name, '-t', str(threads)]

        # handle CRAM (just read all data)
        if filetype == 'CRAM':
            f = open(reads_fn, 'rb', buffering=bufsize); cram_data = f.read(); f.close()

        # handle BAM/SAM (convert to CRAM)
        elif filetype == 'BAM' or filetype == 'SAM':
            try:
                command_samtools_view_cram += [reads_fn]
                if verbose:
                    print("Command: %s" % ' '.join(command_samtools_view_cram))
                cram_data = check_output(command_samtools_view_cram)
            except FileNotFoundError:
                raise RuntimeError("samtools not found in PATH, so BAM/SAM input is not supported")

        # handle FASTQ (map to ref + convert to CRAM)
        elif filetype == 'FASTQ':
            if self.mmi_f is None:
                raise RuntimeError("Database does not have a minimap2 index, so FASTQ input is not supported")
            elif isinstance(reads_fn, str):
                command_minimap2 += [reads_fn]
            else:
                command_minimap2 += reads_fn
            if verbose:
                print("Command: %s | %s" % (' '.join(command_minimap2), ' '.join(command_samtools_view_cram)))
            p_minimap2 = Popen(command_minimap2, stdout=PIPE, stderr=DEVNULL)
            try:
                cram_data = check_output(command_samtools_view_cram, stdin=p_minimap2.stdout)
            except FileNotFoundError: # catching only FileNotFoundError (a bare except would mask unrelated failures)
                raise RuntimeError("samtools not found in PATH, so FASTQ input is not supported")
            p_minimap2.wait()

        # invalid filetype
        else:
            raise TypeError("Invalid filetype: %s (must be CRAM, BAM, SAM, or FASTQ)" % filetype)

        # add this CRAM to the database
        curr_row = (ID, cram_data, None, None, None)
        self.cur.execute("INSERT INTO seqs VALUES(?, ?, ?, ?, ?)", curr_row)
        if commit:
            self.commit()
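    # Example (sketch, hypothetical filenames): adding entries from different input formats:
    #
    #   db.add_entry('sample1', 'reads.cram')                    # stored exactly as-is
    #   db.add_entry('sample2', 'reads.bam', threads=4)          # converted to CRAM via samtools
    #   db.add_entry('sample3', ['reads_R1.fastq.gz', 'reads_R2.fastq.gz'],
    #                filetype='FASTQ', verbose=True)             # mapped with minimap2, then CRAM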
    def add_all_entries(self, other, check_meta=True, check_unique=True, commit=True):
        '''Add all entries from another ViReaDB database into this one

        Args:
            ``other`` (``vireadb.ViReaDB``): The other database from which to add all entries

            ``check_meta`` (``bool``): Check that the metadata are identical across the two databases. Should only be skipped if the user is already guaranteed that they match

            ``check_unique`` (``bool``): Check that every ID is unique (i.e., no IDs in ``other`` already exist in the calling object). Should only be skipped if the user is already guaranteed to not have duplicates

            ``commit`` (``bool``): Commit database after adding these entries
        '''
        if not isinstance(other, type(self)):
            raise TypeError("Other database must be a ViReaDB object, but it was: %s" % str(type(other)))
        if check_meta and self.get_meta() != other.get_meta():
            raise TypeError("Metadata of the databases do not match")
        for curr_row in other:
            if check_unique and curr_row[0] in self:
                warn("Duplicate ID not added: %s" % curr_row[0])
            else:
                self.cur.execute("INSERT INTO seqs VALUES(?, ?, ?, ?, ?)", curr_row)
        if commit:
            self.commit()
    def del_reads(self, ID, confirm=True, commit=True):
        '''Remove the reads from a given entry in this database in order to save space. This should only be done if the counts have already been computed (and even then, it is strongly discouraged).

        Args:
            ``ID`` (``str``): The unique ID of the entry whose reads to remove

            ``confirm`` (``bool``): ``True`` to prompt the user for confirmation before removing the reads, otherwise ``False`` to remove reads silently (e.g. for automation)

            ``commit`` (``bool``): Commit database after updating this entry
        '''
        tmp = self.cur.execute("SELECT CRAM, POS_COUNTS_XZ, INS_COUNTS_XZ FROM seqs WHERE ID=? LIMIT 1", (ID,)).fetchone()
        if tmp is None:
            raise KeyError("ID doesn't exist in database: %s" % ID)
        cram, pos_counts_xz, ins_counts_xz = tmp
        if cram is None:
            raise RuntimeError("No reads in database for ID: %s" % ID)
        if pos_counts_xz is None or ins_counts_xz is None:
            raise RuntimeError("Base and insertion counts have not yet been computed for ID: %s" % ID)
        if confirm and not input("Removing reads from ID: %s\nThis is strongly discouraged and should only be done if you have already backed up the reads elsewhere.\nDo you want to continue? [y/N] " % ID).lower().startswith('y'):
            raise RuntimeError("Abort removing reads from ID: %s" % ID)
        self.cur.execute("UPDATE seqs SET CRAM=? WHERE ID=?", (None, ID))
        if commit:
            self.commit()
        self.vacuum()
    def del_entry(self, ID):
        '''Remove an entry from this database

        Args:
            ``ID`` (``str``): The unique ID of the entry to remove
        '''
        self.cur.execute("DELETE FROM seqs WHERE ID=?", (ID,))
        self.vacuum()
    def rename_entry(self, old_ID, new_ID, commit=True, vacuum=False):
        '''Rename an entry in this database

        Args:
            ``old_ID`` (``str``): The original ID of the entry to rename

            ``new_ID`` (``str``): The new ID to give the entry

            ``commit`` (``bool``): Commit database after renaming this entry

            ``vacuum`` (``bool``): Vacuum database after renaming this entry (to minimize database filesize)
        '''
        if new_ID in self:
            raise ValueError("ID already exists in database: %s" % new_ID)
        self.cur.execute("UPDATE seqs SET ID=? WHERE ID=?", (new_ID, old_ID))
        consensus = self.get_consensus(new_ID)
        if consensus is not None:
            old_header_prefix = ">%s" % old_ID
            if consensus.startswith(old_header_prefix): # replace the old ID in the FASTA header (str.lstrip strips a character set, not a prefix)
                consensus = consensus[len(old_header_prefix):]
            consensus_xz = compress_str(">%s%s" % (new_ID, consensus))
            self.cur.execute("UPDATE seqs SET CONSENSUS_XZ=? WHERE ID=?", (consensus_xz, new_ID))
        if commit:
            self.commit()
        if vacuum:
            self.vacuum()
    def clear(self):
        '''Remove all entries from this database'''
        self.cur.execute("DELETE FROM seqs")
        self.commit()
        self.vacuum()
    def get_entry(self, ID):
        '''Return the data of the entry associated with a given ID in this database

        Args:
            ``ID`` (``str``): The unique ID of the entry to retrieve

        Returns:
            ``bytes`` object containing the CRAM data of the reads

            ``numpy.array`` object containing the position counts

            ``dict`` object containing the insertion counts

            ``str`` object containing the consensus sequence
        '''
        tmp = self.cur.execute("SELECT CRAM, POS_COUNTS_XZ, INS_COUNTS_XZ, CONSENSUS_XZ FROM seqs WHERE ID=? LIMIT 1", (ID,)).fetchone()
        if tmp is None:
            raise KeyError("ID doesn't exist in database: %s" % ID)
        cram, pos_counts_xz, ins_counts_xz, consensus_xz = tmp
        return cram, decompress_pos_counts(pos_counts_xz), decompress_ins_counts(ins_counts_xz), decompress_str(consensus_xz)
    def get_IDs(self):
        '''Return the IDs in this database

        Returns:
            ``list`` object containing all of the IDs in this database
        '''
        return [row[0] for row in self.cur.execute("SELECT ID FROM seqs").fetchall()]
    def compute_counts(self, ID, min_qual=DEFAULT_MIN_QUAL, bufsize=DEFAULT_BUFSIZE, overwrite=False, commit=True):
        '''Compute position and insertion counts for a given entry

        Args:
            ``ID`` (``str``): The unique ID of the entry whose counts to compute

            ``min_qual`` (``int``): Minimum base quality to count a base

            ``bufsize`` (``int``): Buffer size for reading from file

            ``overwrite`` (``bool``): ``True`` to recompute (and overwrite) counts if they already exist

            ``commit`` (``bool``): Commit database after updating this entry
        '''
        # check for validity
        tmp = self.cur.execute("SELECT CRAM, POS_COUNTS_XZ, INS_COUNTS_XZ FROM seqs WHERE ID=? LIMIT 1", (ID,)).fetchone()
        if tmp is None:
            raise KeyError("ID doesn't exist in database: %s" % ID)
        cram_data, pos_counts_xz, ins_counts_xz = tmp
        if pos_counts_xz is not None and ins_counts_xz is not None and not overwrite:
            raise ValueError("Counts already exist for ID: %s" % ID)

        # pull CRAM and compute counts
        cram_f = NamedTemporaryFile('wb', prefix='vireadb', suffix='.cram', buffering=bufsize)
        cram_f.write(cram_data); cram_f.flush()
        aln = open_aln(cram_f.name, self.ref_f.name, threads=1)
        pos_counts, ins_counts = compute_base_counts(aln, self.ref_len, min_qual=min_qual)
        cram_f.close()

        # compress and save counts
        pos_counts_xz = compress_pos_counts(pos_counts)
        ins_counts_xz = compress_ins_counts(ins_counts)
        self.cur.execute("UPDATE seqs SET POS_COUNTS_XZ=? WHERE ID=?", (pos_counts_xz, ID))
        self.cur.execute("UPDATE seqs SET INS_COUNTS_XZ=? WHERE ID=?", (ins_counts_xz, ID))
        if commit:
            self.commit()
    def get_counts(self, ID):
        '''Return the position and insertion counts for a given entry

        Args:
            ``ID`` (``str``): The unique ID of the entry whose counts to return

        Returns:
            The position counts for ``ID`` (or ``None`` if not yet computed)

            The insertion counts for ``ID`` (or ``None`` if not yet computed)
        '''
        tmp = self.cur.execute("SELECT POS_COUNTS_XZ, INS_COUNTS_XZ FROM seqs WHERE ID=? LIMIT 1", (ID,)).fetchone()
        if tmp is None:
            raise KeyError("ID doesn't exist in database: %s" % ID)
        pos_counts_xz, ins_counts_xz = tmp
        pos_counts = decompress_pos_counts(pos_counts_xz)
        ins_counts = decompress_ins_counts(ins_counts_xz)
        return pos_counts, ins_counts
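    # Example (sketch): computing and retrieving per-position base counts and
    # insertion counts for an existing entry ('sample1' is a hypothetical ID):
    #
    #   db.compute_counts('sample1')   # raises ValueError if counts already exist
    #   pos_counts, ins_counts = db.get_counts('sample1')
    #   # pos_counts is a numpy array of per-position base counts;
    #   # ins_counts is a dict of insertion counts (see get_entry)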
    def compute_consensus(self, ID, min_depth=DEFAULT_MIN_DEPTH, min_freq=DEFAULT_MIN_FREQ, ambig=DEFAULT_AMBIG, remove_gaps=True, overwrite=False, commit=True):
        '''Compute the consensus sequence for a given entry. The position and insertion counts must have already been computed

        Args:
            ``ID`` (``str``): The unique ID of the entry whose consensus to compute

            ``min_depth`` (``int``): Minimum depth to call a base/insertion in the consensus

            ``min_freq`` (``float``): Minimum frequency [0,1] to call a base/insertion in the consensus

            ``ambig`` (``str``): Symbol to use for ambiguous bases in the consensus

            ``remove_gaps`` (``bool``): Remove gap characters (``-``) from the consensus

            ``overwrite`` (``bool``): ``True`` to recompute (and overwrite) the consensus if it already exists

            ``commit`` (``bool``): Commit database after updating this entry
        '''
        # check for validity
        tmp = self.cur.execute("SELECT POS_COUNTS_XZ, INS_COUNTS_XZ, CONSENSUS_XZ FROM seqs WHERE ID=? LIMIT 1", (ID,)).fetchone()
        if tmp is None:
            raise KeyError("ID doesn't exist in database: %s" % ID)
        pos_counts_xz, ins_counts_xz, consensus_xz = tmp
        if pos_counts_xz is None or ins_counts_xz is None:
            raise RuntimeError("Must compute counts before computing consensus for ID: %s" % ID)
        if consensus_xz is not None and not overwrite:
            raise ValueError("Consensus already exists for ID: %s" % ID)

        # decompress counts, compute consensus, and save
        pos_counts = decompress_pos_counts(pos_counts_xz)
        ins_counts = decompress_ins_counts(ins_counts_xz)
        consensus_seq = compute_consensus(pos_counts, ins_counts, min_depth=min_depth, min_freq=min_freq, ambig=ambig)
        consensus = ">%s (vireadb v%s, min_depth=%s, min_freq=%s, ambig=%s, remove_gaps=%s)\n%s\n" % (ID, VERSION, min_depth, min_freq, ambig, remove_gaps, consensus_seq)
        consensus_xz = compress_str(consensus)
        self.cur.execute("UPDATE seqs SET CONSENSUS_XZ=? WHERE ID=?", (consensus_xz, ID))
        if commit:
            self.commit()
    def get_consensus(self, ID):
        '''Return the consensus sequence for a given entry

        Args:
            ``ID`` (``str``): The unique ID of the entry whose consensus to return

        Returns:
            The consensus sequence for ``ID`` as a FASTA string (or ``None`` if not yet computed)
        '''
        tmp = self.cur.execute("SELECT CONSENSUS_XZ FROM seqs WHERE ID=? LIMIT 1", (ID,)).fetchone()
        if tmp is None:
            raise KeyError("ID doesn't exist in database: %s" % ID)
        return decompress_str(tmp[0])
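    # Example (sketch): computing a consensus sequence once counts exist, then
    # reading it back as a FASTA string ('sample1' is a hypothetical ID):
    #
    #   db.compute_consensus('sample1', min_depth=10, min_freq=0.5, ambig='N')
    #   fasta_str = db.get_consensus('sample1')   # ">sample1 (...)\n<sequence>\n"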
    def export_cram(self, ID, out_fn, overwrite=False):
        '''Export the CRAM file of a given entry

        Args:
            ``ID`` (``str``): The unique ID of the entry whose CRAM to export

            ``out_fn`` (``str``): The path of the output CRAM file

            ``overwrite`` (``bool``): Overwrite the output file if it exists
        '''
        if isfile(out_fn) and not overwrite:
            raise ValueError("Output file exists: %s" % out_fn)
        tmp = self.cur.execute("SELECT CRAM FROM seqs WHERE ID=? LIMIT 1", (ID,)).fetchone()
        if tmp is None:
            raise KeyError("ID doesn't exist in database: %s" % ID)
        f = open(out_fn, 'wb'); f.write(tmp[0]); f.close()
    def export_fasta(self, out_fn, IDs=None, overwrite=False):
        '''Export multiple consensus sequences as a FASTA file

        Args:
            ``out_fn`` (``str``): The path of the output FASTA file

            ``IDs`` (``list``): List of IDs whose consensus sequences to export, or ``None`` to export all consensus sequences in the database

            ``overwrite`` (``bool``): Overwrite the output file if it exists
        '''
        if isfile(out_fn) and not overwrite:
            raise ValueError("Output file exists: %s" % out_fn)
        if isinstance(IDs, str):
            IDs = [IDs]
        elif IDs is None:
            IDs = self.get_IDs()
        f = open(out_fn, 'w')
        for ID in IDs:
            try:
                seq = self.get_consensus(ID)
            except KeyError:
                warn("ID doesn't exist in database and was thus skipped: %s" % ID)
                continue # without this, seq would be unbound (or stale from a previous iteration)
            if seq is None:
                warn("Consensus sequence hasn't been computed and was thus skipped: %s" % ID)
            else:
                f.write(seq)
        f.close()
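    # Example (sketch, hypothetical filenames): exporting data back out of the database:
    #
    #   db.export_cram('sample1', 'sample1.cram')     # raw reads as CRAM
    #   db.export_fasta('all_consensus.fas')          # all computed consensus sequences
    #   db.export_fasta('subset.fas', IDs=['sample1', 'sample2'], overwrite=True)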
def create_db(db_fn, ref_fn, overwrite=False, bufsize=DEFAULT_BUFSIZE):
    '''Create a new ViReaDB database

    Args:
        ``db_fn`` (``str``): The filename of the SQLite3 database file representing this database

        ``ref_fn`` (``str``): The filename of the viral reference genome to use for this database

        ``overwrite`` (``bool``): Overwrite ``db_fn`` if it already exists

        ``bufsize`` (``int``): Buffer size for reading from file

    Returns:
        ``ViReaDB`` object
    '''
    # check valid inputs
    if not isfile(ref_fn):
        raise ValueError("File not found: %s" % ref_fn)
    if isdir(db_fn):
        raise ValueError("db_fn exists as a directory: %s" % db_fn)
    if isfile(db_fn):
        if overwrite:
            remove(db_fn)
        else:
            raise ValueError("db_fn exists: %s" % db_fn)

    # load reference genome
    ref_name, ref_seq = load_ref(ref_fn)
    ref_seq_xz = compress_str(ref_seq)

    # index reference genome
    mmi_data = index_ref_genome(ref_fn, bufsize=bufsize)
    if mmi_data is None:
        warn("minimap2 not found in PATH, so reference genome will not be indexed")

    # create SQLite3 database and populate with `meta` table
    con = sqlite3.connect(db_fn); cur = con.cursor()
    cur.execute("CREATE TABLE meta(%s)" % ', '.join(META_TABLE_COLS))
    cur.execute("INSERT INTO meta VALUES(?, ?)", ('VERSION', VERSION))
    cur.execute("INSERT INTO meta VALUES(?, ?)", ('REF_NAME', ref_name))
    cur.execute("INSERT INTO meta VALUES(?, ?)", ('REF_SEQ_XZ', ref_seq_xz))
    cur.execute("INSERT INTO meta VALUES(?, ?)", ('REF_MMI', mmi_data))
    cur.execute("CREATE TABLE seqs(%s)" % ', '.join(SEQS_TABLE_COLS))
    con.commit(); con.close()
    return ViReaDB(db_fn)
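# Example (sketch, hypothetical filenames): a typical end-to-end workflow, assuming
# create_db is re-exported at the package level (otherwise import from vireadb.vireadb):
#
#   import vireadb
#   db = vireadb.create_db('my.db', 'ref.fas')   # reference is indexed if minimap2 is installed
#   db.add_entry('sample1', 'reads.fastq')
#   db.compute_counts('sample1')
#   db.compute_consensus('sample1')
#   print(db.get_consensus('sample1'))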
def index_ref_genome(ref_fn, bufsize=DEFAULT_BUFSIZE):
    '''Index a reference genome using minimap2

    Args:
        ``ref_fn`` (``str``): The filename of the reference genome to index

        ``bufsize`` (``int``): Buffer size for reading from file

    Returns:
        The minimap2 index as a ``bytes`` object, or ``None`` if minimap2 is not found in PATH
    '''
    mmi_f = NamedTemporaryFile('w', prefix='vireadb', suffix='.mmi', buffering=bufsize)
    mmi_fn = mmi_f.name; mmi_f.close() # only a unique temporary filename is needed; minimap2 writes the index itself
    try:
        call(BASE_COMMAND_MINIMAP2 + ['-d', mmi_fn, ref_fn], stdout=DEVNULL, stderr=DEVNULL)
        mmi_f = open(mmi_fn, 'rb'); mmi_data = mmi_f.read()
        mmi_f.close(); remove(mmi_fn)
    except FileNotFoundError:
        mmi_data = None
    return mmi_data
def load_db(db_fn):
    '''Load a ViReaDB database from file

    Args:
        ``db_fn`` (``str``): The filename of the SQLite3 database file representing this database

    Returns:
        ``ViReaDB`` object
    '''
    if not isfile(db_fn):
        raise ValueError("db_fn not found: %s" % db_fn)
    return ViReaDB(db_fn)
def merge_dbs(out_db_fn, in_db_fns, check_meta=True, overwrite=False):
    '''Merge multiple ViReaDB databases

    Args:
        ``out_db_fn`` (``str``): The filename of the SQLite3 database file representing the output database

        ``in_db_fns`` (``list``): The filenames of the SQLite3 database files representing the input databases

        ``check_meta`` (``bool``): Check that the metadata are identical across the databases. Should only be skipped if the user is already guaranteed that they match

        ``overwrite`` (``bool``): Overwrite ``out_db_fn`` if it already exists

    Returns:
        ``ViReaDB`` object
    '''
    # validity check
    for in_db_fn in in_db_fns:
        if not isfile(in_db_fn):
            raise ValueError("Input database file not found: %s" % in_db_fn)
    if isfile(out_db_fn):
        if overwrite:
            remove(out_db_fn)
        else:
            raise ValueError("out_db_fn exists: %s" % out_db_fn)

    # merge databases
    copyfile(in_db_fns[0], out_db_fn)
    out_db = load_db(out_db_fn); out_db_meta = out_db.get_meta()
    for in_db_fn in in_db_fns[1:]:
        curr_db = load_db(in_db_fn)
        if check_meta and out_db_meta != curr_db.get_meta():
            del out_db; remove(out_db_fn)
            raise TypeError("Metadata of the databases do not match")
        out_db.add_all_entries(curr_db)
    return out_db
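
# Example (sketch, hypothetical filenames): merging two single-sample databases built
# against the same reference into one combined database:
#
#   merged = merge_dbs('merged.db', ['run1.db', 'run2.db'])
#   print(len(merged), merged.get_IDs())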