Source code for deeprm.utils.utils
"""
Utility functions for DeepRM.
"""
import numpy as np
[docs]
def mean_phred(phred):
"""
Calculates the mean Phred quality score.
Args:
phred (numpy.ndarray): Array or list of Phred quality scores.
Returns:
float: Mean Phred quality score.
"""
if not isinstance(phred, np.ndarray):
phred = np.array(phred, dtype=int)
else:
phred = phred.astype(int)
return -10 * np.log10(np.mean(10 ** (-phred / 10)))
[docs]
def maybe_index_bam(bam_path: str, threads: int = 1) -> str:
"""
Ensure a BAM is indexed. If indexing fails because the BAM is not coordinate-sorted,
write a coordinate-sorted copy (<stem>.sorted[.N].bam), index it, and return that path.
Returns:
Path (as str) to the BAM that is guaranteed to be indexed (original or sorted copy).
"""
from pathlib import Path
import psutil
import pysam
bam_path = Path(bam_path)
if not bam_path.exists():
raise FileNotFoundError(f"BAM not found: {bam_path}")
if bam_path.suffix.lower() != ".bam":
raise ValueError(f"Expected a .bam file, got: {bam_path.name}")
def _has_usable_index(p) -> bool:
try:
with pysam.AlignmentFile(str(p), "rb", check_sq=False) as bam:
# has_index(): index file discoverable; check_index(): validate it matches & is readable
return bool(bam.has_index()) and bool(bam.check_index())
except Exception:
return False
# Fast path: already indexed & usable
if _has_usable_index(bam_path):
return str(bam_path)
# Before attempting to index, ensure the BAM has @SQ. Without it, indexing is not possible.
try:
with pysam.AlignmentFile(str(bam_path), "rb", check_sq=False) as bam:
sq = bam.header.get("SQ", None)
if not sq:
raise ValueError(f"{bam_path} has no @SQ lines in the header; cannot build a BAM index.")
except Exception as e:
# Re-raise with a clearer message
raise RuntimeError(f"Failed to read BAM header: {bam_path}") from e
# Try indexing the BAM as-is. If it fails due to unsortedness, fall back to sorting.
try:
pysam.index(str(bam_path), nthreads=threads)
if _has_usable_index(bam_path):
return str(bam_path)
# If indexing "succeeded" but index still isn't usable, treat as failure.
raise RuntimeError("Index creation did not produce a usable index.")
except Exception as e:
err_msg = str(e).lower()
print(f"Indexing failed for {bam_path}: {err_msg}")
# Sort to a new file, then index that.
out_base = bam_path.with_suffix("") # strip .bam
sorted_path = out_base.with_name(out_base.name + ".sorted").with_suffix(".bam")
if sorted_path.exists():
# Make a unique name: .sorted.1.bam, .sorted.2.bam, ...
maxretcount = 100
i = 1
while i <= maxretcount:
candidate = out_base.with_name(f"{out_base.name}.sorted.{i}").with_suffix(".bam")
if not candidate.exists():
sorted_path = candidate
break
i += 1
pysam_thread_memory = max(1, int(0.5 * psutil.virtual_memory().available / (1024**2) / threads))
pysam.sort("-@", str(threads), f"-m {pysam_thread_memory}M", "-o", str(sorted_path), str(bam_path))
pysam.index("-@", str(threads), str(sorted_path))
if not _has_usable_index(sorted_path):
raise RuntimeError(f"Sorted BAM was created but index is not usable: {sorted_path}")
return str(sorted_path)