Bioinformatics notebook

Code chunks that I’ve found repeatedly useful when dealing with genomic data.


BCFtools

#bcftools v1.21

# Count how many variants in a VCF
bcftools view -H file.vcf.gz | wc -l
#-H = no header

# Count how many chromosomes/scaffolds in a VCF
bcftools query -f '%CHROM\n' file.vcf.gz | uniq | wc -l

# Get a list of variants in a VCF (CHROM, POS, ID)
bcftools query -f '%CHROM\t%POS\t%ID\n' file.vcf.gz

# Get list of individuals in a VCF
bcftools query -l file.vcf.gz

# Pull all information for one variant
bcftools view -H -i 'ID=="SNPID"' file.vcf.gz

# Pull all information for one sample
bcftools view -s SampleID file.vcf.gz

# Remove or keep a list of variants from a VCF (list is newline-separated)
bcftools view -e 'ID == @remove.txt' file.vcf.gz -Oz -o out.vcf.gz
bcftools view -i 'ID == @keep.txt' file.vcf.gz -Oz -o out.vcf.gz

# Pull only CHROM, POS, ID, REF, ALT, and GT
bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT[\t%GT]\n' file.vcf.gz > genotype_table.txt

# Create a general site summary table (will be very large!)
echo -e "CHROM\tPOS\tID\tREF\tALT\tGT:AD:DP:GQ" > summary_table.txt
bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT[\t%GT:%AD:%DP:%GQ]\n' file.vcf.gz >> summary_table.txt

Samtools

#samtools v1.21

# SAM to BAM
samtools view -S -b samfile.sam > bamfile.bam
# BAM to SAM
samtools view -h -o samfile.sam bamfile.bam

# Get mean read depth from an alignment
samtools depth -a file.bam | awk '{c++;s+=$3}END{print s/c}'

# Extract alignments that only mapped to a specific chromosome
samtools view -b file.bam "Chr1" > Chr1_alns.bam

# Extract a specific region from a FASTA
samtools faidx file.fasta #index first
samtools faidx file.fasta Chr1:1-1000 > Chr1_1-1000.fasta

# Extract chromosome/scaffold names and lengths, sort by length in descending order
samtools faidx file.fasta #index first
cut -f1-2 file.fai | sort -n -r -k 2 > scaffold_lengths.txt

General

# Extract all chromosome/scaffold names from a FASTA file
grep "^>" mygenome.fasta > list_of_scaffolds.txt

# Search a directory + sub-directories for a file name containing a string
# Example: find all files ending in ".bam"
find . -type f -name "*.bam"

# Execute a command for all files in a directory + sub-directories whose name matches a certain pattern
find . -type f -name "pattern" -exec [COMMAND] {} [TARGET] \;
# Example: find all files in current directory ending in ".bam" and move to a directory called target/
find . -type f -name "*.bam" -exec mv {} target/ \;

# Create a symlink
ln -s /path/to/original/file /path/to/target/folder