PySAM

0.23.3 · active · verified Fri Apr 10

Pysam is a Python module for reading, manipulating, and writing genomic datasets. It is a lightweight wrapper of the HTSlib API, providing facilities to work with SAM/BAM/CRAM, VCF/BCF, BED, GFF/GTF, FASTA/FASTQ files, and access samtools/bcftools command-line functionality. The module supports compression and random access through indexing. Pysam is actively maintained with regular releases, often wrapping new versions of the underlying htslib, samtools, and bcftools C libraries.

Warnings

Install

Imports

Quickstart

This quickstart demonstrates how to open and read data from common genomic file formats (SAM, VCF, FASTA) using `pysam.AlignmentFile`, `pysam.VariantFile`, and `pysam.FastaFile`. It includes creating dummy files for a self-contained example.

import pysam
import os

# --- Example 1: Read a BAM/SAM file ---
# Create a dummy BAM file for demonstration
# In a real scenario, you would use an existing BAM file and its index.
# For this quickstart, we'll create a simple unmapped SAM file first.
# NOTE: pysam.AlignmentFile requires a header for 'wb' mode.

header = {
    'HD': {'VN': '1.0'},
    'SQ': [{'LN': 1000, 'SN': 'chr1'}]
}

dummy_sam_path = "example_reads.sam"
with pysam.AlignmentFile(dummy_sam_path, "wh", header=header) as outfile:
    # Create a simple unmapped read (no reference_id or pos)
    read = pysam.AlignedSegment(header)
    read.query_name = "read1"
    read.query_sequence = "ATGCATGC"
    read.query_qualities = pysam.qualities_to_ints("BBBBBBBB")
    read.flag = 4 # UNMAPPED
    outfile.write(read)

print(f"Reading from {dummy_sam_path}:")
with pysam.AlignmentFile(dummy_sam_path, "r") as samfile:
    for read in samfile.fetch(until_eof=True): # fetch(until_eof=True) for unindexed or SAM files
        print(f"  Read: {read.query_name}, Sequence: {read.query_sequence}, Mapped: {not read.is_unmapped}")
os.remove(dummy_sam_path)

# --- Example 2: Read a VCF file ---
# Create a dummy VCF file
dummy_vcf_path = "example_variants.vcf"
with open(dummy_vcf_path, "w") as f:
    f.write("##fileformat=VCFv4.2\n")
    f.write("##contig=<ID=chr1,length=1000>\n")
    f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE1\n")
    f.write("chr1\t100\t.\tA\tG\t100\tPASS\t.\tGT\t0/1\n")
    f.write("chr1\t200\t.\tC\tT\t90\tPASS\t.\tGT\t1/1\n")

print(f"\nReading from {dummy_vcf_path}:")
vcf_file = pysam.VariantFile(dummy_vcf_path, "r")
for variant in vcf_file:
    print(f"  Variant: {variant.chrom}:{variant.pos} {variant.ref}>{variant.alts}")
vcf_file.close()
os.remove(dummy_vcf_path)

# --- Example 3: Read a FASTA file ---
# Create a dummy FASTA file
dummy_fasta_path = "example_reference.fasta"
with open(dummy_fasta_path, "w") as f:
    f.write(">chr1\n")
    f.write("ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\n")
    f.write(">chr2\n")
    f.write("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG\n")

print(f"\nReading from {dummy_fasta_path}:")
fasta_file = pysam.FastaFile(dummy_fasta_path)
sequence = fasta_file.fetch("chr1", 5, 15) # 0-based start, 0-based exclusive end
print(f"  Fetched sequence from chr1 (5-15): {sequence}")
fasta_file.close()
os.remove(dummy_fasta_path)

view raw JSON →