This report documents the steps to prepare genotype data for imputation using the TOPMed Imputation Server. The following tools and reference files are required:
Tool | Purpose | Install Command |
---|---|---|
PLINK | Quality control, format conversion | conda install -c bioconda plink |
vcf-sort | Sort VCF files (part of vcftools) | Included in vcftools, or use bcftools sort |
bgzip | Compress VCF files to .vcf.gz | conda install -c bioconda htslib |
tabix | Index compressed VCFs | conda install -c bioconda htslib |
Perl | Required to run HRC-1000G-check-bim.pl | Pre-installed on most systems |
File | Description | Command |
---|---|---|
HRC-1000G-check-bim.pl | Script to harmonize SNP positions/alleles | |
HRC.r1-1.GRCh37.wgs.mac5.sites.tab | Reference SNP list for QC | |
# ---- Select the SNPs that are kept for imputation ----
# Prefix of the input PLINK fileset; "<prefix>.bim" is read below and the
# same prefix is reused by the later PLINK steps.
plinkFile <- "ADNI_QC_FINAL"
# Directory holding the fileset. Defaults to the current directory, so the
# setwd() call is a no-op unless dataDir is edited to point elsewhere.
dataDir <- getwd()
setwd(dataDir)

# Read the BIM file with explicit column types so parsing does not depend on
# the content (e.g. an allele column holding only "T" would otherwise be
# converted to logical TRUE). Quote and comment characters are disabled so
# SNP IDs containing "'" or "#" cannot break the row layout.
bim <- read.table(
  paste0(plinkFile, ".bim"),
  header = FALSE,
  col.names = c("CHR", "SNP", "CM", "BP", "A1", "A2"),
  colClasses = c(
    "character", "character", "numeric",
    "integer", "character", "character"
  ),
  quote = "",
  comment.char = ""
)

# Keep chromosomes 1-22 and X. PLINK writes X as "23" by default, so both
# spellings are accepted; otherwise X would be dropped silently.
keep_chr <- c(as.character(1:22), "X", "23")
bim <- bim[bim$CHR %in% keep_chr, ]

# Keep SNPs whose two alleles are both single A/C/G/T bases
valid_alleles <- c("A", "C", "T", "G")
bim <- bim[bim$A1 %in% valid_alleles & bim$A2 %in% valid_alleles, ]

# Remove every SNP that shares its position with another SNP. The position
# key must include the chromosome: the same base-pair coordinate on two
# different chromosomes is not a duplicate.
pos_key <- paste(bim$CHR, bim$BP, sep = ":")
dup_keys <- unique(pos_key[duplicated(pos_key)])
bim <- bim[!pos_key %in% dup_keys, ]

# An empty list would only make the following PLINK --extract step fail
# with a less helpful message, so stop here instead.
if (nrow(bim) == 0L) {
  stop("No SNPs left after filtering ", plinkFile, ".bim", call. = FALSE)
}

# Save the IDs of the retained SNPs, one per line
write.table(
  bim$SNP, "ValidSNPs.txt",
  quote = FALSE, row.names = FALSE, col.names = FALSE
)
# ---- Subset to the valid SNPs and compute allele frequencies ----
# Three PLINK calls, each consuming the output of the previous one:
#   1. keep the SNPs listed in ValidSNPs.txt, apply the MAF filter and
#      write a text fileset (--recode) under the same prefix;
#   2. convert that text fileset back to a binary fileset (--make-bed),
#      again under the same prefix, so the input fileset is overwritten;
#   3. write allele frequencies (--freq) for the filtered fileset.
plink_steps <- c(
  paste(
    "plink --bfile", plinkFile,
    "--extract ValidSNPs.txt --maf 0.00001 --recode --output-chr M --out",
    plinkFile
  ),
  paste(
    "plink --file", plinkFile,
    "--output-chr M --make-bed --out", plinkFile
  ),
  paste("plink --bfile", plinkFile, "--freq --out", plinkFile)
)

# system() returns the command's exit status. Stop at the first failure so
# later steps never run on stale or missing files.
for (cmd in plink_steps) {
  status <- system(cmd)
  if (status != 0) {
    stop(
      "Command failed with exit status ", status, ": ", cmd,
      call. = FALSE
    )
  }
}
# ---- Check the fileset against the HRC reference panel ----
# Locations of the check script and of the reference site list; edit these
# placeholders to the real paths before running.
hrc_script <- "/path/to/HRC-1000G-check-bim.pl"
hrc_ref <- "/path/to/HRC.r1-1.GRCh37.wgs.mac5.sites.tab"

# Paths are shell-quoted so directories containing spaces still work.
# NOTE(review): "-o" is passed last with no value. If the script's -o option
# expects an output directory, supply one or drop the flag -- confirm
# against the script's usage text.
hrc_cmd <- paste(
  "perl", shQuote(hrc_script),
  "-h -r", shQuote(hrc_ref),
  "-b", shQuote(paste0(plinkFile, ".bim")),
  "-f", shQuote(paste0(plinkFile, ".frq")),
  "-c -p EUR -o"
)
hrc_status <- system(hrc_cmd)
if (hrc_status != 0) {
  stop(
    "HRC check script failed with exit status ", hrc_status,
    call. = FALSE
  )
}

# Run-plink.sh is expected to have been written to the working directory by
# the check script; fail clearly if it is missing.
if (!file.exists("Run-plink.sh")) {
  stop("Run-plink.sh was not found in ", getwd(), call. = FALSE)
}

# Make the generated script executable, then run it; stop on any failure.
for (cmd in c("chmod 755 Run-plink.sh", "./Run-plink.sh")) {
  status <- system(cmd)
  if (status != 0) {
    stop(
      "Command failed with exit status ", status, ": ", cmd,
      call. = FALSE
    )
  }
}
# ---- Export a per-chromosome SNP list ----
# For each chromosome 1-22, read "<plinkFile>-updated-chr<N>.bim" and write
# its 2nd and 6th columns, tab-separated and without header, row names or
# quotes, to "snps_<N>.txt".
chromosomes <- 1:22
for (chr in chromosomes) {
  chr_bim_path <- paste0(plinkFile, "-updated-chr", chr, ".bim")
  snp_list_path <- paste0("snps_", chr, ".txt")

  chr_variants <- read.table(chr_bim_path, header = FALSE)
  write.table(
    chr_variants[, c(2, 6)],
    snp_list_path,
    sep = "\t",
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE
  )
}
# Sort, bgzip-compress and tabix-index the per-chromosome VCFs (1-22).
# The input prefix must match the PLINK prefix used by the R steps
# ("ADNI_QC_FINAL"); outputs are written as ADNI1-updated-chr<i>.vcf.gz.
for i in {1..22}; do
  in_vcf="ADNI_QC_FINAL-updated-chr${i}.vcf"
  out_vcf="ADNI1-updated-chr${i}.vcf.gz"

  # Stop at the first missing input instead of writing an empty archive.
  if [ ! -f "$in_vcf" ]; then
    echo "Missing input VCF: $in_vcf" >&2
    break
  fi

  # Index and report success only when sorting/compression succeeded.
  if vcf-sort "$in_vcf" | bgzip -c > "$out_vcf" && tabix -p vcf "$out_vcf"; then
    echo "Processed chr$i"
  else
    echo "Failed to process chr$i" >&2
    break
  fi
done
For each chromosome <i> (1–22), the loop produces a compressed VCF and its tabix index:
ADNI1-updated-chr<i>.vcf.gz
ADNI1-updated-chr<i>.vcf.gz.tbi
These files can now be uploaded to the TOPMed or Michigan Imputation Server.