1、R实现
# Split two-character genotype calls (e.g. "GA") read from "test.ped" into
# separate allele columns: every input column becomes two adjacent output
# columns (first allele, second allele). Bare object names below auto-print
# the intermediate values when the script is run at top level.

dir()

# Read every column as text so that numeric-looking genotype codes such as
# "00" are not converted to numbers (which would break the string handling).
dat <- read.table("test.ped", colClasses = "character")
dat

# One list element per genotype column.
genoList <- unname(as.list(dat))
genoList
length(genoList)

# First allele of every call. substr() works element by element, so a call
# that is not exactly two characters long (or is NA) cannot shift the alleles
# of the following samples, unlike flattening strsplit() output.
a1 <- lapply(genoList, substr, start = 1, stop = 1)
a1

# Second allele of every call.
a2 <- lapply(genoList, substr, start = 2, stop = 2)
a2

# Turn each allele list into a data frame with one column per marker.
a1 <- as.data.frame(
  matrix(unlist(a1), byrow = FALSE, ncol = ncol(dat)),
  stringsAsFactors = FALSE
)
a1
a2 <- as.data.frame(
  matrix(unlist(a2), byrow = FALSE, ncol = ncol(dat)),
  stringsAsFactors = FALSE
)
a2

# Interleave the columns: marker i contributes elements 2i - 1 (first allele)
# and 2i (second allele). The list is preallocated instead of grown.
temp_list <- vector("list", 2 * ncol(dat))
for (i in seq_len(ncol(dat))) {
  temp_list[[i * 2 - 1]] <- a1[[i]]
  temp_list[[i * 2]] <- a2[[i]]
}
temp_list

# Final table with twice as many columns as the input.
result <- as.data.frame(
  matrix(unlist(temp_list), byrow = FALSE, ncol = 2 * ncol(dat)),
  stringsAsFactors = FALSE
)
result
dat
> dir() [1] "test.ped" > dat <- read.table("test.ped") ## 测试数据 > dat V1 V2 V3 V4 V5 V6 1 GG CC GG GG GA AA 2 TT GC CC GG GG AA 3 TT GC CG GG GG TT 4 GG GC GG GG GG AA > genoList =list() > for ( i in 1:ncol(dat) ) { ## 保存为新列表 + genoList[[i]]<- dat[,i] + } > genoList [[1]] [1] "GG" "TT" "TT" "GG" [[2]] [1] "CC" "GC" "GC" "GC" [[3]] [1] "GG" "CC" "CG" "GG" [[4]] [1] "GG" "GG" "GG" "GG" [[5]] [1] "GA" "GG" "GG" "GG" [[6]] [1] "AA" "AA" "TT" "AA" > length(genoList) [1] 6 > a1 <- lapply(genoList,function(x){unlist(strsplit(x,""))[seq(1, 2 * dim(dat)[1], 2)]}) ## 拆分第一个等位基因 > a1 [[1]] [1] "G" "T" "T" "G" [[2]] [1] "C" "G" "G" "G" [[3]] [1] "G" "C" "C" "G" [[4]] [1] "G" "G" "G" "G" [[5]] [1] "G" "G" "G" "G" [[6]] [1] "A" "A" "T" "A" > a2 <- lapply(genoList,function(x){unlist(strsplit(x,""))[seq(2, 2 * dim(dat)[1], 2)]}) ## 拆分第二个等位基因 > a2 [[1]] [1] "G" "T" "T" "G" [[2]] [1] "C" "C" "C" "C" [[3]] [1] "G" "C" "G" "G" [[4]] [1] "G" "G" "G" "G" [[5]] [1] "A" "G" "G" "G" [[6]] [1] "A" "A" "T" "A" > a1 <- as.data.frame(matrix(unlist(a1), byrow = F, ncol = ncol(dat))) ## 转换为数据框 > a1 V1 V2 V3 V4 V5 V6 1 G C G G G A 2 T G C G G A 3 T G C G G T 4 G G G G G A > a2 <- as.data.frame(matrix(unlist(a2), byrow = F, ncol = ncol(dat))) ## 转换为数据框 > a2 V1 V2 V3 V4 V5 V6 1 G C G G A A 2 T C C G G A 3 T C G G G T 4 G C G G G A > temp_list <- list() > for (i in 1:ncol(dat)) { ## 合并在新列表中 + temp_list[[i * 2 - 1]] = a1[,i] + temp_list[[i * 2]] = a2[,i] + } > temp_list [[1]] [1] "G" "T" "T" "G" [[2]] [1] "G" "T" "T" "G" [[3]] [1] "C" "G" "G" "G" [[4]] [1] "C" "C" "C" "C" [[5]] [1] "G" "C" "C" "G" [[6]] [1] "G" "C" "G" "G" [[7]] [1] "G" "G" "G" "G" [[8]] [1] "G" "G" "G" "G" [[9]] [1] "G" "G" "G" "G" [[10]] [1] "A" "G" "G" "G" [[11]] [1] "A" "A" "T" "A" [[12]] [1] "A" "A" "T" "A" > result <- as.data.frame(matrix(unlist(temp_list), byrow = F, ncol = 2 * ncol(dat))) ## 转换为数据框 > result ## 查看结果 V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 1 G G C C G G G G G A A A 2 T T G C C C G G G G A A 3 T T G C C G G G G G T 
T 4 G G G C G G G G G G A A > dat V1 V2 V3 V4 V5 V6 1 GG CC GG GG GA AA 2 TT GC CC GG GG AA 3 TT GC CG GG GG TT 4 GG GC GG GG GG AA
参考:https://zhuanlan.zhihu.com/p/378405836
2、shell实现
root@PC1:/home/test# ls test.ped root@PC1:/home/test# cat test.ped ## 测试数据 GG CC GG GG GA AA TT GC CC GG GG AA TT GC CG GG GG TT GG GC GG GG GG AA root@PC1:/home/test# sed 's/. / &/g' test.ped ## 使用sed将“字符空格”替换为“空格字符空格” G G C C G G G G G A AA T T G C C C G G G G AA T T G C C G G G G G TT G G G C G G G G G G AA root@PC1:/home/test# sed 's/. / &/g' test.ped | sed 's/.$/ &/' ## 将行尾的最后一个字符替换为“空格字符” G G C C G G G G G A A A T T G C C C G G G G A A T T G C C G G G G G T T G G G C G G G G G G A A