# # Exact test for HWE: 2 alleles # # See eg Emigh TH. Comparison of tests for Hardy-Weinberg Equilibrium. # Biometrics 1980; 36: 627-642 # hwe.table <- function (loc,loc.2=NULL,trait=NULL) { if (!is.null(loc.2)) loc=paste(loc,loc.2,sep="/") counts<-table(loc) if (!is.null(trait)) x.counts<-table(trait,loc) gtp<-unlist(dimnames(counts)) alleles<-sort(unique(unlist(strsplit(gtp,"/")))) all.genos<-outer(alleles,alleles,paste,sep="/") all.genos<-all.genos[!lower.tri(all.genos)] if (is.null(trait)) { fullcount<-integer(length(all.genos)) names(fullcount)<-all.genos fullcount[all.genos %in% gtp]<-counts }else{ fullcount<-matrix(ncol=length(all.genos), nrow=nrow(x.counts)) colnames(fullcount)<-all.genos rownames(fullcount)<-rownames(x.counts) gtp<-dimnames(x.counts)[[2]] for(i in 1:nrow(fullcount)) { fullcount[i,all.genos %in% gtp]<-x.counts[i,] } } fullcount } dhwe2 <- function(n11, n12, n22) { f <- function(x) lgamma(x+1) n <- n11+n12+n22 n1 <- 2*n11+n12 n2 <- 2*n22+n12 exp(log(2)*(n12) + f(n) - f(n11) - f(n12) - f(n22) - f(2*n) + f(n1) + f(n2)) } hwe2.test <- function(n11, n12=NULL, n22=NULL) { if (length(n11)==3 && is.null(n12) && is.null(n22)) { n12<-n11[2] n22<-n11[3] n11<-n11[1] } n1 <- 2*n11+n12 n2 <- 2*n22+n12 x12 <- seq(n1 %% 2,min(n1,n2),2) x11 <- (n1-x12)/2 x22 <- (n2-x12)/2 dist <- data.frame(n11=x11,n12=x12,n22=x22,density=dhwe2(x11,x12,x22)) dist <- dist[order(dist$density),] data.frame(n11,n12,n22, freq=n1/(n1+n2), prob=cumsum(dist$density)[dist$n11==n11 & dist$n12==n12 & dist$n22==n22]) } ntcode<-function(x, drop=FALSE) { gtp<-as.character(x) bad <- nchar(gtp)>2 | gtp=="" gtp[bad] <- NA idx<-nchar(gtp) res<-ifelse(idx==1, paste(gtp,gtp,sep="/"), ifelse(is.na(x), NA, ifelse(idx==2, {a1<-substr(gtp,1,1); a2<-substr(gtp,2,2); ifelse(a1>a2, paste(a2,a1,sep="/"), paste(a1,a2,sep="/") ) }, NA ) ) ) if (drop) { factor(res) }else{ alleles<-sort(unique(unlist(strsplit(gtp[!is.na(gtp)],"")))) alleles <- alleles[alleles!=" "] all.genos<-outer(alleles,alleles,paste,sep="/") all.genos<-all.genos[!lower.tri(all.genos)] factor(res,levels=all.genos) } } count.miss <- function(x) { if (is.data.frame(x) || is.matrix(x) || (is.array(x) && length(dim(x))==2)) { res<-apply(x,1,function(x) sum(is.na(x))) }else{ stop("Expected a rectangular data structure") } res } # # Convert "1/2" to 1,2 # geno.as.array <- function(genotypes,renumber=FALSE,miss=NULL,gtp.sep="/") { mknum<-function(genotypes, renumber=FALSE, gtp.sep="/") { alleles<- strsplit(as.character(genotypes), gtp.sep) gtp<-cbind(sapply(alleles, function(x) x[1], simplify=TRUE), sapply(alleles, function(x) x[2], simplify=TRUE)) if (renumber) { alleles<-unique(unlist(alleles)) gtp[,1]<-as.numeric(factor(gtp[,1],levels=alleles)) gtp[,2]<-as.numeric(factor(gtp[,2],levels=alleles)) } if (is.null(miss)) { gtp[is.na(genotypes),]<-NA }else{ gtp[is.na(genotypes),]<-miss } gtp } if (any(unlist(lapply(genotypes, function(x) regexpr(gtp.sep, as.character(x[!is.na(x)]))<0)))) { stop(paste("Expecting genotype data of form a", gtp.sep, "b!", sep="")) } if (is.null(ncol(genotypes)) || ncol(genotypes)==1) { res<-mknum(genotypes, renumber=renumber) }else{ res<-data.frame(mknum(genotypes[,1], renumber=renumber)) for(i in 2:ncol(genotypes)) { res<-cbind(res,mknum(genotypes[,i], renumber=renumber)) } colnames(res)<-c(t(outer(names(genotypes),1:2,paste,sep="."))) } res } describe.snp <- function(snp, trait=NULL) { if (is.null(trait)) { prop<-prop.table(count<-hwe.table(snp)) minor.allele<-strsplit(names(prop)[which.min(prop)],"/")[[1]][1] prop<-0.5*prop[2]+min(prop[-2]) names(prop)<-minor.allele hwe.p<-hwe2.test(count)$prob homog.p<-NA }else{ count<-hwe.table(snp) minor<-which.min(count) minor.allele<-strsplit(names(count)[minor],"/")[[1]][1] prop<-prop.table(count<-hwe.table(snp,trait=trait),1) prop<-0.5*prop[,2]+prop[,minor] hwe.p<-apply(count,1,function(x) hwe2.test(x)$prob) homog.p<-chisq.test(table(trait, snp), sim=TRUE, B=100000)$p.value } res<-list() res$snp<-deparse(match.call()$snp) res$trait<-deparse(match.call()$trait) res$counts<-count res$minor.allele<-minor.allele res$allele.frequency<-prop res$HWE.exact.p<-hwe.p res$gtp.homog.p<-homog.p class(res)<-"describe.snp" res } print.describe.snp <- function(x, ...) { cat("\nSNP: ",x$snp) if (x$trait!="NULL") cat(" versus ",x$trait) cat(" Total genotyped =",sum(x$counts),"\n\n") head<-paste(" Minor Allele (",x$minor.allele,") ",sep="") if (x$trait!="NULL") { tab<-cbind(x$counts,round(x$allele.frequency,3),round(x$HWE.exact.p,3)) }else{ tab<-t(rbind(as.matrix(x$counts),round(x$allele.frequency,3), round(x$HWE.exact.p,3))) rownames(tab)<-"" } colnames(tab)[4:5]<-c(head, "HWE Exact P") print(tab,quote=FALSE, ...) if (x$trait!="NULL") cat("\nGenotypic homogeneity P=",x$gtp.homog.p, " (based on 10^5 simulations).\n") } hglm <- function(trait, geno, family=binomial()) { require(haplo.stats) res<-haplo.score(trait, geno.as.array(geno), trait.type=family$family, locus.label=names(geno), skip.haplo=0.02, simulate=TRUE) cat("Global Score=", res$score.global," (df=", res$df,"; sim P=", res$score.global.p.sim,")\n", sep="") gtp<-setupGeno(geno.as.array(geno)) res<-haplo.glm(trait ~ gtp, family=family, allele.lev=attributes(gtp)$unique.alleles, locus.label=names(geno), control = haplo.glm.control(haplo.freq.min=0.02)) cat(" LRTS=", res$lrt$lrt," (df=", res$lrt$df,"; P=", pchisq(res$lrt$lrt,res$lrt$df,lower=FALSE),")\n", sep="") res } # # best haplotype from haplo.em # best.haps <- function(x) { imputed <- data.frame(id=x$subj.id, hap=paste(x$hap1code, x$hap2code, sep="/"), post=x$post) hap.codelist <- apply(x$haplotype,1,paste,collapse="") which.hap <- by(imputed$post, imputed$id, which.max) which.hap <- which.hap+match(names(which.hap), imputed$id)-1 imputed[which.hap, ] } # # draw exon structure of gene on existing plot (plus SNPs) # draw.gene <- function(gene, height=0.1) { require(RSNPper) gene.str <- geneLayout(geneInfo(gene)[1]) exon.start <- as.numeric(gene.str[grep("exon[0-9]+.start",names(gene.str))]) exon.end <- as.numeric(gene.str[grep("exon[0-9]+.end",names(gene.str))]) nex <- length(exon.start) rect(exon.start[1], 0, exon.end[nex], height, col="yellow") rect(exon.start, rep(0,nex), exon.end, rep(height, nex), col="red") } # # Read MERLIN .dat, .map and .ped files # read.merlin <- function(prefix="merlin", pedfile=paste(prefix,".ped",sep=""), datfile=paste(prefix,".dat",sep=""), mapfile=paste(prefix,".map",sep=""), uniqueid=FALSE) { if (!file.exists(datfile)) { stop(paste("Cannot find",datfile)) }else if (!file.exists(pedfile)) { stop(paste("Cannot find",pedfile)) } loci <- read.table(datfile, colClasses="character") nloci <- nrow(loci) locpos <- rep(1, nloci) locpos[loci[,1] %in% c("M")] <- 2 locpos <- c(6,6+cumsum(locpos[-nloci])) width <- locpos[nrow(loci)] markers <- locpos[loci[,1] %in% c("M")] loctyp <- rep("numeric", width) loctyp[c(1:5, sort(c(markers,markers+1)))] <- "character" ped <- read.table(pedfile, na.string=c("-","x"), colClasses=loctyp) for(i in markers) { ped[ped[,i]==0, i] <- NA idx <- !is.na(ped[,i]) ped[idx,i] <- paste(ped[idx,i], ped[idx,i+1], sep="/") } ped <- ped[, c(1:5,locpos)] names(ped) <- c("pedigree","id","fa","mo","sex", loci[,2]) if (uniqueid) { founder <- is.na(ped[,3]) | ped[,3]=="0" id <- paste(ped[,1], ped[,2]) fa <- paste(ped[,1], ped[,3]) fa[founder] <- NA mo <- paste(ped[,1], ped[,4]) mo[founder] <- NA masterids <- unique(c(id, fa, mo)) newid <- match(id, masterids) newmo <- newfa <- integer(length(newid)) newfa[!founder] <- match(fa[!founder], masterids) newmo[!founder] <- match(mo[!founder], masterids) ped$id <- newid ped$fa <- newfa ped$mo <- newmo } ped }