PySAM
Pysam is a Python module for reading, manipulating, and writing genomic datasets. It is a lightweight wrapper of the HTSlib API, providing facilities to work with SAM/BAM/CRAM, VCF/BCF, BED, GFF/GTF, FASTA/FASTQ files, and access samtools/bcftools command-line functionality. The module supports compression and random access through indexing. Pysam is actively maintained with regular releases, often wrapping new versions of the underlying htslib, samtools, and bcftools C libraries.
Warnings
- breaking Pysam v0.23.0 was the last release to officially support Python 3.6 and 3.7. Subsequent versions (v0.23.1 and later) require Python 3.8 or newer.
- breaking Pysam v0.20.0 was the final release to support Python 2.x. All versions after 0.20.0 are Python 3-only.
- gotcha When using `AlignmentFile.fetch()` or similar methods, coordinates are generally 0-based and half-open (exclusive end). This is a common convention in bioinformatics but can lead to off-by-one errors if 1-based or fully-inclusive ranges are expected.
- gotcha Pysam v0.23.1 inadvertently broke binary compatibility for Cython projects that depend on pysam. This was fixed in v0.23.2. Pure Python projects using pysam were not affected.
- gotcha While `pip install pysam` typically works by installing pre-built wheels (which include compiled htslib and do not require Cython), installing from source or a repository clone requires a C compiler and Cython to be pre-installed.
Install
-
pip install pysam -
conda install pysam -c bioconda
Imports
- pysam
import pysam
- AlignmentFile
from pysam import AlignmentFile
- VariantFile
from pysam import VariantFile
- FastaFile
from pysam import FastaFile
Quickstart
import pysam
import os
# --- Example 1: Read a BAM/SAM file ---
# Create a dummy BAM file for demonstration
# In a real scenario, you would use an existing BAM file and its index.
# For this quickstart, we'll create a simple unmapped SAM file first.
# NOTE: pysam.AlignmentFile requires a header for 'wb' mode.
header = {
'HD': {'VN': '1.0'},
'SQ': [{'LN': 1000, 'SN': 'chr1'}]
}
dummy_sam_path = "example_reads.sam"
with pysam.AlignmentFile(dummy_sam_path, "wh", header=header) as outfile:
# Create a simple unmapped read (no reference_id or pos)
read = pysam.AlignedSegment(header)
read.query_name = "read1"
read.query_sequence = "ATGCATGC"
read.query_qualities = pysam.qualities_to_ints("BBBBBBBB")
read.flag = 4 # UNMAPPED
outfile.write(read)
print(f"Reading from {dummy_sam_path}:")
with pysam.AlignmentFile(dummy_sam_path, "r") as samfile:
for read in samfile.fetch(until_eof=True): # fetch(until_eof=True) for unindexed or SAM files
print(f" Read: {read.query_name}, Sequence: {read.query_sequence}, Mapped: {not read.is_unmapped}")
os.remove(dummy_sam_path)
# --- Example 2: Read a VCF file ---
# Create a dummy VCF file
dummy_vcf_path = "example_variants.vcf"
with open(dummy_vcf_path, "w") as f:
f.write("##fileformat=VCFv4.2\n")
f.write("##contig=<ID=chr1,length=1000>\n")
f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE1\n")
f.write("chr1\t100\t.\tA\tG\t100\tPASS\t.\tGT\t0/1\n")
f.write("chr1\t200\t.\tC\tT\t90\tPASS\t.\tGT\t1/1\n")
print(f"\nReading from {dummy_vcf_path}:")
vcf_file = pysam.VariantFile(dummy_vcf_path, "r")
for variant in vcf_file:
print(f" Variant: {variant.chrom}:{variant.pos} {variant.ref}>{variant.alts}")
vcf_file.close()
os.remove(dummy_vcf_path)
# --- Example 3: Read a FASTA file ---
# Create a dummy FASTA file
dummy_fasta_path = "example_reference.fasta"
with open(dummy_fasta_path, "w") as f:
f.write(">chr1\n")
f.write("ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\n")
f.write(">chr2\n")
f.write("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG\n")
print(f"\nReading from {dummy_fasta_path}:")
fasta_file = pysam.FastaFile(dummy_fasta_path)
sequence = fasta_file.fetch("chr1", 5, 15) # 0-based start, 0-based exclusive end
print(f" Fetched sequence from chr1 (5-15): {sequence}")
fasta_file.close()
os.remove(dummy_fasta_path)