R语言将不含空格的基因型列数据拆分为以空格分隔的等位基因

1、R语言实现

# Split every two-character genotype column of test.ped (e.g. "GA") into two
# single-allele columns ("G", "A"), so a table with k columns becomes a table
# with 2 * k columns.

# List the working directory to confirm that test.ped is present.
dir()

# Read the genotype table. Forcing every column to character skips type
# conversion (an all-"00" column would otherwise be read as integer 0) and
# avoids factor columns on R < 4.0; both cases break the string functions
# used below.
dat <- read.table("test.ped", colClasses = "character")
dat

# Store each genotype column as one element of an unnamed list.
genoList <- lapply(seq_len(ncol(dat)), function(i) dat[[i]])
genoList

length(genoList)

# First allele: the first character of every genotype. substr() works cell by
# cell, so an NA or malformed genotype cannot shift the alleles of other rows.
a1 <- lapply(genoList, substr, start = 1, stop = 1)
a1

# Second allele: the second character of every genotype.
a2 <- lapply(genoList, substr, start = 2, stop = 2)
a2

# Reshape the first alleles into a data frame (one column per input column).
a1 <- as.data.frame(
  matrix(unlist(a1), byrow = FALSE, ncol = ncol(dat)),
  stringsAsFactors = FALSE
)
a1

# Reshape the second alleles the same way.
a2 <- as.data.frame(
  matrix(unlist(a2), byrow = FALSE, ncol = ncol(dat)),
  stringsAsFactors = FALSE
)
a2

# Interleave the allele columns: input column i supplies output columns
# 2 * i - 1 (first allele) and 2 * i (second allele). The list is preallocated
# instead of being grown inside the loop.
temp_list <- vector("list", 2 * ncol(dat))

for (i in seq_len(ncol(dat))) {
  temp_list[[i * 2 - 1]] <- a1[[i]]
  temp_list[[i * 2]] <- a2[[i]]
}
temp_list

# Convert the interleaved list into the final data frame.
result <- as.data.frame(
  matrix(unlist(temp_list), byrow = FALSE, ncol = 2 * ncol(dat)),
  stringsAsFactors = FALSE
)
result
dat
> dir()
[1] "test.ped"
> dat <- read.table("test.ped")   ## 测试数据
> dat
  V1 V2 V3 V4 V5 V6
1 GG CC GG GG GA AA
2 TT GC CC GG GG AA
3 TT GC CG GG GG TT
4 GG GC GG GG GG AA
> genoList =list()
> for ( i in 1:ncol(dat) ) {     ## 保存为新列表
+   genoList[[i]]<- dat[,i]
+ }
> genoList
[[1]]
[1] "GG" "TT" "TT" "GG"

[[2]]
[1] "CC" "GC" "GC" "GC"

[[3]]
[1] "GG" "CC" "CG" "GG"

[[4]]
[1] "GG" "GG" "GG" "GG"

[[5]]
[1] "GA" "GG" "GG" "GG"

[[6]]
[1] "AA" "AA" "TT" "AA"

> length(genoList)
[1] 6
> a1 <- lapply(genoList,function(x){unlist(strsplit(x,""))[seq(1, 2 * dim(dat)[1], 2)]})   ## 拆分第一个等位基因
> a1
[[1]]
[1] "G" "T" "T" "G"

[[2]]
[1] "C" "G" "G" "G"

[[3]]
[1] "G" "C" "C" "G"

[[4]]
[1] "G" "G" "G" "G"

[[5]]
[1] "G" "G" "G" "G"

[[6]]
[1] "A" "A" "T" "A"

> a2 <- lapply(genoList,function(x){unlist(strsplit(x,""))[seq(2, 2 * dim(dat)[1], 2)]})  ## 拆分第二个等位基因
> a2
[[1]]
[1] "G" "T" "T" "G"

[[2]]
[1] "C" "C" "C" "C"

[[3]]
[1] "G" "C" "G" "G"

[[4]]
[1] "G" "G" "G" "G"

[[5]]
[1] "A" "G" "G" "G"

[[6]]
[1] "A" "A" "T" "A"

> a1 <- as.data.frame(matrix(unlist(a1), byrow = F, ncol = ncol(dat)))  ## 转换为数据框
> a1
  V1 V2 V3 V4 V5 V6
1  G  C  G  G  G  A
2  T  G  C  G  G  A
3  T  G  C  G  G  T
4  G  G  G  G  G  A
> a2 <- as.data.frame(matrix(unlist(a2), byrow = F, ncol = ncol(dat)))  ## 转换为数据框
> a2
  V1 V2 V3 V4 V5 V6
1  G  C  G  G  A  A
2  T  C  C  G  G  A
3  T  C  G  G  G  T
4  G  C  G  G  G  A
> temp_list <- list()
> for (i in 1:ncol(dat)) {          ## 合并在新列表中
+   temp_list[[i * 2 - 1]] = a1[,i]
+   temp_list[[i * 2]] = a2[,i]
+ }
> temp_list
[[1]]
[1] "G" "T" "T" "G"

[[2]]
[1] "G" "T" "T" "G"

[[3]]
[1] "C" "G" "G" "G"

[[4]]
[1] "C" "C" "C" "C"

[[5]]
[1] "G" "C" "C" "G"

[[6]]
[1] "G" "C" "G" "G"

[[7]]
[1] "G" "G" "G" "G"

[[8]]
[1] "G" "G" "G" "G"

[[9]]
[1] "G" "G" "G" "G"

[[10]]
[1] "A" "G" "G" "G"

[[11]]
[1] "A" "A" "T" "A"

[[12]]
[1] "A" "A" "T" "A"

> result <- as.data.frame(matrix(unlist(temp_list), byrow = F, ncol = 2 * ncol(dat)))   ## 转换为数据框
> result   ## 查看结果
  V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
1  G  G  C  C  G  G  G  G  G   A   A   A
2  T  T  G  C  C  C  G  G  G   G   A   A
3  T  T  G  C  C  G  G  G  G   G   T   T
4  G  G  G  C  G  G  G  G  G   G   A   A
> dat
  V1 V2 V3 V4 V5 V6
1 GG CC GG GG GA AA
2 TT GC CC GG GG AA
3 TT GC CG GG GG TT
4 GG GC GG GG GG AA

参考:https://zhuanlan.zhihu.com/p/378405836

2、shell实现

root@PC1:/home/test# ls
test.ped
root@PC1:/home/test# cat test.ped   ## 测试数据
GG CC GG GG GA AA
TT GC CC GG GG AA
TT GC CG GG GG TT
GG GC GG GG GG AA
root@PC1:/home/test# sed 's/. / &/g' test.ped   ## 使用sed将“字符+空格”替换为“空格+字符+空格”
G G C C G G G G G A AA
T T G C C C G G G G AA
T T G C C G G G G G TT
G G G C G G G G G G AA
root@PC1:/home/test# sed 's/. / &/g' test.ped | sed 's/.$/ &/'  ## 将每行行尾的最后一个字符替换为“空格+字符”
G G C C G G G G G A A A
T T G C C C G G G G A A
T T G C C G G G G G T T
G G G C G G G G G G A A
原文地址:https://www.cnblogs.com/liujiaxin2018/p/15709097.html