RandomForest&ROC

  1 # -*- coding: utf-8 -*-
  2 # __author__ = 'JieYao'
  3 from biocluster.agent import Agent
  4 from biocluster.tool import Tool
  5 import os
  6 import types
  7 import subprocess
  8 from biocluster.core.exceptions import OptionError
  9 
 10 
 11 class RandomforestAgent(Agent):
 12     """
 13     需要RandomForest.pl
 14     version v1.0
 15     author: JieYao
 16     last_modified:2016.07.18
 17     """
 18 
 19     def __init__(self, parent):
 20         super(RandomforestAgent, self).__init__(parent)
 21         options = [
 22             {"name": "otutable", "type": "infile", "format": "meta.otu.otu_table, meta.otu.tax_summary_dir"},
 23             {"name": "level", "type": "string", "default": "otu"},
 24             {"name": "envtable", "type": "infile", "format": "meta.otu.group_table"},
 25             {"name": "envlabs", "type": "string", "default": ""},
 26             {"name": "ntree", "type": "int", "default": 500 },
 27             {"name": "problem_type", "type": "int", "default": 2 },
 28             {"name": "top_number", "type": "int", "default": 50}
 29         ]
 30         self.add_option(options)
 31         self.step.add_steps('RandomforestAnalysis')
 32         self.on('start', self.step_start)
 33         self.on('end', self.step_end)
 34 
 35     def step_start(self):
 36         self.step.RandomforestAnalysis.start()
 37         self.step.update()
 38 
 39     def step_end(self):
 40         self.step.RandomforestAnalysis.finish()
 41         self.step.update()
 42 
 43     def gettable(self):
 44         """
 45         根据输入的otu表和分类水平计算新的otu表
 46         :return:
 47         """
 48         if self.option('otutable').format == "meta.otu.tax_summary_dir":
 49             return self.option('otutable').get_table(self.option('level'))
 50         else:
 51             return self.option('otutable').prop['path']
 52         
 53     def check_options(self):
 54         """
 55         重写参数检查
 56         """
 57         if not self.option('otutable').is_set:
 58             raise OptionError('必须提供otu表')
 59         self.option('otutable').get_info()
 60         if self.option('otutable').prop['sample_num'] < 2:
 61             raise OptionError('otu表的样本数目少于2,不可进行随机森林特征分析')
 62         if self.option('envtable').is_set:
 63             self.option('envtable').get_info()
 64             if self.option('envlabs'):
 65                 labs = self.option('envlabs').split(',')
 66                 for lab in labs:
 67                     if lab not in self.option('envtable').prop['group_scheme']:
 68                         raise OptionError('envlabs中有不在物种(环境因子)表中存在的因子:%s' % lab)
 69             else:
 70                 pass
 71             if len(self.option('envtable').prop['sample']) < 2:
 72                 raise OptionError('物种(环境因子)表的样本数目少于2,不可进行随机森林特征分析')
 73         samplelist = open(self.gettable()).readline().strip().split('	')[1:]
 74         if self.option('envtable').is_set:
 75             self.option('envtable').get_info()
 76             if len(self.option('envtable').prop['sample']) > len(samplelist):
 77                 raise OptionError('OTU表中的样本数量:%s少于物种(环境因子)表中的样本数量:%s' % (len(samplelist),
 78                                   len(self.option('envtable').prop['sample'])))
 79             for sample in self.option('envtable').prop['sample']:
 80                 if sample not in samplelist:
 81                     raise OptionError('物种(环境因子)表的样本中存在OTU表中未知的样本%s' % sample)
 82         table = open(self.gettable())
 83         if len(table.readlines()) < 4 :
 84             raise OptionError('数据表信息少于3行')
 85         table.close()
 86         if self.option('top_number') > self.option('otutable').prop['otu_num']:
 87             self.option('top_number', self.option('otutable').prop['otu_num'])
 88         return True
 89     
 90     def set_resource(self):
 91         """
 92         设置所需资源
 93         """
 94         self._cpu = 2
 95         self._memory = ''
 96     
 97     def end(self):
 98         result_dir = self.add_upload_dir(self.output_dir)
 99         result_dir.add_relpath_rules([
100             [".", "", "RandomForest分析结果和ROC计算数据输出目录"],
101             ["./randomforest_confusion_table.xls", "xls", "RandomForest样本分组模拟结果"],
102             ["./randomforest_mds_sites.xls", "xls", "样本点坐标表"],
103             ["./randomforest_proximity_table.xls", "xls", "样本相似度临近矩阵"],
104             ["./randomforest_topx_vimp.xls", "xls", "Top-X物种(环境因子)丰度表"],
105             ["./randomforest_vimp_table.xls", "xls", "所有物种(环境因子)重要度表"],
106             ["./randomforest_predicted_answer.xls", "xls", "随机森林预测分组结果表"],
107             ["./randomforest_votes_probably.xls","xls", "随机森林各样本分组投票预测概率表"],
108             ["./roc_table.xls", "xls", "ROC数据标准化后数据表"],
109             ["./roc_point.xls", "xls", "ROC作图坐标点数据表"],
110             ["./auc.xls", "xls", "ROC折线下方面积值"],
111         ])
112         print self.get_upload_files()
113         super(RandomforestAgent, self).end()
114 
115         
116 class RandomforestTool(Tool):
117     def __init__(self, config):
118         super(RandomforestTool, self).__init__(config)
119         self._version = '1.0.1'
120         self.cmd_path = self.config.SOFTWARE_DIR + '/bioinfo/meta/scripts/RandomForest_perl.pl'
121         self.env_table = self.get_new_env()
122         self.otu_table = self.get_otu_table()
123     
124     def get_otu_table(self):
125         """
126         根据调用的level参数重构otu表
127         :return:
128         """
129         if self.option('otutable').format == "meta.otu.tax_summary_dir":
130             otu_path = self.option('otutable').get_table(self.option('level'))
131         else:
132             otu_path = self.option('otutable').prop['path']
133         if self.option('envtable').is_set:
134             return self.filter_otu_sample(otu_path, self.option('envtable').prop['sample'],
135                                           os.path.join(self.work_dir, 'temp_filter.otutable'))
136         else:
137             return otu_path
138     
139     def filter_otu_sample(self, otu_path, filter_samples, newfile):
140         if not isinstance(filter_samples, types.ListType):
141             raise Exception('过滤otu表样本的样本名称应为列表')
142         try:
143             with open(otu_path, 'rb') as f, open(newfile, 'wb') as w:
144                 one_line = f.readline()
145                 all_samples = one_line.rstrip().split('	')[1:]
146                 if not ((set(all_samples) & set(filter_samples)) == set(filter_samples)):
147                     raise Exception('提供的过滤样本集合中存在otu表中不存在的样本all:%s,filter_samples:%s' % (all_samples, filter_samples))
148                 if len(all_samples) == len(filter_samples):
149                     return otu_path
150                 samples_index = [all_samples.index(i) + 1 for i in filter_samples]
151                 w.write('OTU	' + '	'.join(filter_samples) + '
')
152                 for line in f:
153                     all_values = line.rstrip().split('	')
154                     new_values = [all_values[0]] + [all_values[i] for i in samples_index]
155                     w.write('	'.join(new_values) + '
')
156                 return newfile
157         except IOError:
158             raise Exception('无法打开OTU相关文件或者文件不存在')
159 
160     def get_new_env(self):
161         """
162         根据envlabs生成新的envtable
163         """
164         if self.option('envlabs'):
165             new_path = self.work_dir + '/temp_env_table.xls'
166             self.option('envtable').sub_group(new_path, self.option('envlabs').split(','))
167             return new_path
168         else:
169             return self.option('envtable').path
170     
171     def run(self):
172         """
173         运行
174         """
175         super(RandomforestTool, self).run()
176         self.run_RandomForest_perl()
177     
178     def formattable(self, tablepath):
179         with open(tablepath) as table:
180             if table.read(1) == '#':
181                 newtable = os.path.join(self.work_dir, 'temp_format.table')
182                 with open(newtable, 'w') as w:
183                     w.write(table.read())
184                 return newtable
185         return tablepath
186     
187     def run_RandomForest_perl(self):
188         """
189         运行RandomForest.pl
190         """
191         real_otu_path = self.formattable(self.otu_table)
192         cmd = self.config.SOFTWARE_DIR + '/program/perl/perls/perl-5.24.0/bin/perl ' + self.cmd_path
193         cmd += ' -i %s -o %s' % (real_otu_path, self.work_dir + '/RandomForest')
194         if self.option('envtable').is_set:
195             cmd += ' -g %s -m %s' % (self.env_table, self.env_table)
196         cmd += ' -ntree %s' % (str(self.option('ntree')))
197         cmd += ' -type %s' % (str(self.option('problem_type')))
198         cmd += ' -top %s' % (str(self.option('top_number')))
199         self.logger.info('运行RandomForest_perl.pl程序进行RandomForest计算')
200         
201         try:
202             subprocess.check_output(cmd, shell=True)
203             self.logger.info('生成 cmd.r 文件成功')
204         except subprocess.CalledProcessError:
205             self.logger.info('生成 cmd.r 文件失败')
206             self.set_error('无法生成 cmd.r 文件')
207         try:
208             subprocess.check_output(self.config.SOFTWARE_DIR + 
209                                     '/program/R-3.3.1/bin/R --restore --no-save < %s/cmd.r' % (self.work_dir + '/RandomForest'), shell=True)
210             self.logger.info('RandomForest计算成功')
211         except subprocess.CalledProcessError:
212             self.logger.info('RandomForest计算失败')
213             self.set_error('R运行计算RandomForest失败')
214         self.logger.info('运行RandomForest_perl.pl程序进行RandomForest计算完成')
215         allfiles = self.get_filesname()        
216         self.linkfile(self.work_dir + '/RandomForest/' + allfiles[1], 'randomforest_mds_sites.xls')
217         self.linkfile(self.work_dir + '/RandomForest/' + allfiles[2], 'randomforest_proximity_table.xls')
218         self.linkfile(self.work_dir + '/RandomForest/' + allfiles[3], 'randomforest_topx_vimp.xls')
219         self.linkfile(self.work_dir + '/RandomForest/' + allfiles[4], 'randomforest_vimp_table.xls')
220         if self.option('envtable').is_set:
221             if allfiles[0] and allfiles[5] and allfiles[6]:
222                 self.linkfile(self.work_dir + '/RandomForest/' + allfiles[0], 'randomforest_confusion_table.xls')
223                 self.linkfile(self.work_dir + '/RandomForest/' + allfiles[5], 'randomforest_predicted_answer.xls')
224                 self.linkfile(self.work_dir + '/RandomForest/' + allfiles[6], 'randomforest_votes_probably.xls')
225             else:
226                 self.set_error('按分组计算的文件生成出错')
227         if not self.option('envtable').is_set:
228             self.end()
229         if not (allfiles[5] and allfiles[6]):
230             self.end()
231         cmd = self.config.SOFTWARE_DIR + '/program/perl/perls/perl-5.24.0/bin/perl ' + self.config.SOFTWARE_DIR + '/bioinfo/meta/scripts/calc_roc.pl'
232         cmd += ' -i1 %s' %(self.work_dir + '/RandomForest/randomforest_votes_probably.xls')
233         cmd += ' -i2 %s' %(self.work_dir + '/RandomForest/randomforest_predicted_answer.xls')
234         cmd += ' -o %s' %(self.work_dir + '/ROC/')
235         self.logger.info('开始运行calc_roc.pl计算ROC相关数据')
236         
237         try:
238             subprocess.check_output(cmd, shell=True)
239             self.logger.info('生成 roc_cmd.r 成功')
240         except subprocess.CalledProcessError:
241             self.logger.info('生成 roc_cmd.r 失败')
242             self.set_error('无法生成 roc_cmd.r 文件')
243         try:
244             subprocess.check_output(self.config.SOFTWARE_DIR +
245                                     '/program/R-3.3.1/bin/R --restore --no-save < %s/roc_cmd.r' % (self.work_dir + '/ROC'), shell=True)
246             self.logger.info('ROC计算成功')
247         except subprocess.CalledProcessError:
248             self.logger.info('ROC计算失败')
249             self.set_error('R运行计算ROC失败')
250         self.logger.info('运行calc_roc.pl程序进行ROC计算完成')
251         allfiles = self.get_roc_filesname()
252         self.linkfile(self.work_dir + '/ROC/' + allfiles[0], 'roc_table.xls')
253         self.linkfile(self.work_dir + '/ROC/' + allfiles[1], 'roc_point.xls')
254         self.linkfile(self.work_dir + '/ROC/' + allfiles[2], 'auc.xls')
255         self.end()
256 
257     def linkfile(self, oldfile, newname):
258         """
259         link文件到output文件夹
260         :param oldfile: 资源文件路径
261         :param newname: 新的文件名
262         :return:
263         """
264         newpath = os.path.join(self.output_dir, newname)
265         if os.path.exists(newpath):
266             os.remove(newpath)
267         os.link(oldfile, newpath)
268 
269     def get_roc_filesname(self):
270         filelist = os.listdir(self.work_dir + '/ROC')
271         roc_table_file = None
272         roc_point_file = None
273         auc_file = None
274         for name in filelist:
275             if 'roc_table.xls' in name:
276                 roc_table_file = name
277             elif 'roc_point.xls' in name:
278                 roc_point_file = name
279             elif 'auc.xls' in name:
280                 auc_file = name
281         if (roc_table_file and roc_point_file and auc_file):
282             return [roc_table_file, roc_point_file, auc_file]
283         else:
284             self.set_error("未知原因,ROC计算结果丢失")
285     
286     def get_filesname(self):
287         filelist = os.listdir(self.work_dir + '/RandomForest')
288         randomforest_confusion_table_file = None
289         randomforest_mds_sites_file = None
290         randomforest_proximity_table_file = None
291         randomforest_topx_vimp_file = None
292         randomforest_vimp_table_file = None
293         randomforest_predicted_answer_file = None
294         randomforest_votes_probably_file = None
295         for name in filelist:
296             if 'randomforest_confusion_table.xls' in name:
297                 randomforest_confusion_table_file = name
298             elif 'randomforest_mds_sites.xls' in name:
299                 randomforest_mds_sites_file = name
300             elif 'randomforest_proximity_table.xls' in name:
301                 randomforest_proximity_table_file = name
302             elif 'randomforest_topx_vimp.xls' in name:
303                 randomforest_topx_vimp_file = name
304             elif 'randomforest_vimp_table.xls' in name:
305                 randomforest_vimp_table_file = name
306             elif 'randomforest_predicted_answer.xls' in name:
307                 randomforest_predicted_answer_file = name
308             elif 'randomforest_votes_probably.xls' in name:
309                 randomforest_votes_probably_file = name
310         if (randomforest_mds_sites_file and randomforest_proximity_table_file and 
311             randomforest_topx_vimp_file and randomforest_vimp_table_file):
312             if self.option('envtable').is_set:
313                 if not randomforest_confusion_table_file:
314                     self.set_error('未知原因,样本分组模拟结果丢失或未生成')
315                 if not randomforest_predicted_answer_file:
316                     self.set_error('未知原因,样本分组预测结果文件丢失或未生成')
317                 if not randomforest_votes_probably_file:
318                     self.set_error('未知原因,样本分组预测概率表丢失或未生成')
319             return [randomforest_confusion_table_file, randomforest_mds_sites_file,
320                     randomforest_proximity_table_file, randomforest_topx_vimp_file,
321                     randomforest_vimp_table_file, randomforest_predicted_answer_file,
322                     randomforest_votes_probably_file]
323         else:
324             self.set_error('未知原因,数据计算结果丢失或者未生成')
View Code
  1 #!/mnt/ilustre/users/sanger-dev/app/program/perl/perls/perl-5.24.0/bin/perl
  2 use strict;
  3 use warnings;
  4 use Getopt::Long;
  5 my %opts;
  6 my $VERSION = "V2.20160708";
  7 GetOptions( %opts,"i=s","m=s","o=s","g=s","ntree=i","top=i","type=s");
  8 my $usage = <<"USAGE";
  9        Program : $0   
 10        Discription: Program used to caculate randomforest,with mds plot and importance variables given .  
 11        Version : $VERSION
 12        Contact : jie.yao@majorbio.com
 13        Usage :perl $0 [options]         
 14                         -i      * input otu table file 
 15                         -o      * output dir
 16                         -m      input mapping file if you want set points's color and pch by groups. If omitted, randomForest will run in unsupervised mode.
 17                 Default:none
 18                         -g      group name in mapping file .Default:none
 19                         -ntree    Number of trees to grow. This should not be set to too     small a number, to ensure that every input row gets predicted at least a few times.Default:500    
 20                         -top    How many variables to show? 
 21                         -type     either 1,2 or 3, specifying the type of importance measure (1=mean decrease in accuracy, 2=mean decrease in node impurity).
 22             
 23        Example:$0 -i otu_table.xls -o randomForest  -m group -g group  
 24 
 25 USAGE
 26 
 27 die $usage if(!($opts{i}&&$opts{o}));
 28 die $usage if($opts{m}&& !$opts{g});
 29 die $usage if(!$opts{m}&& $opts{g});
 30 
 31 $opts{m}=defined $opts{m}?$opts{m}:"none";
 32 $opts{g}=defined $opts{g}?$opts{g}:"none";
 33 $opts{ntree}=defined $opts{ntree}?$opts{ntree}:"500";
 34 $opts{type}=defined $opts{type}?$opts{type}:"1";
 35 $opts{top}=defined $opts{top}?$opts{top}:"50";
 36 
 37 
 38 if(! -e $opts{o}){
 39                 `mkdir $opts{o}`;
 40 }
 41 
 42 
 43 open CMD,">$opts{o}/cmd.r";
 44 print CMD "
 45 library(sp,warn.conflicts = F)
 46 library(randomForest,warn.conflicts = F)
 47 library(maptools,warn.conflicts = F)
 48 basename="randomforest"
 49 
 50 
 51 # if read otu data
 52 otu <-read.table("$opts{i}",sep="\t",head=T,check.names = F)
 53 rownames(otu) <-as.factor(otu[,1])
 54 otu <-otu[,-1]
 55 rownames(otu) <-sapply(rownames(otu),function(x) gsub("_*{.+}"," ",x,perl = TRUE))
 56 rownames(otu) <-sapply(rownames(otu),function(x) gsub("-","_",x,perl = TRUE))
 57 rownames(otu) <-sapply(rownames(otu),function(x) gsub("\\[","",x,perl = TRUE))
 58 rownames(otu) <-sapply(rownames(otu),function(x) gsub("\\]","",x,perl = TRUE))
 59 rownames(otu) <-sapply(rownames(otu),function(x) gsub("\\(","",x,perl = TRUE))
 60 rownames(otu) <-sapply(rownames(otu),function(x) gsub("\\)","",x,perl = TRUE))
 61 rownames(otu) <-sapply(rownames(otu),function(x) gsub("^[0-9]","X\\1",x,perl = TRUE))
 62 rownames(otu) <-sapply(rownames(otu),function(x) gsub("/","",x,perl = TRUE))
 63 otu <-as.data.frame(t(otu),stringsAsFactors=T)
 64 
 65 map="$opts{m}"
 66 if(map !="none"){
 67                 sd <-read.table("$opts{m}",head=T,sep="\t",comment.char = "",check.names = FALSE)        
 68                 rownames(sd) <- as.character(sd[,1])
 69                 sd[,1] <-as.character(sd[,1])
 70                 sd$group <-as.factor(sd$group )
 71                 legend <- as.matrix(unique(sd$group)) 
 72 }
 73 
 74 set.seed(1)
 75 if(map != "none"){
 76 otu.rf <- randomForest(sd$group ~ .,otu,importance=T,proximity=T,ntree=$opts{ntree})
 77 
 78 
 79 
 80 class_count <-as.matrix(table(sd$group))
 81 class <-data.frame(count=class_count)
 82 
 83 ##randomforest votes probably
 84 votes_probably<- paste("$opts{o}/",basename,"_votes_probably.xls",sep="")
 85 write.table(otu.rf$votes,votes_probably,sep="\t",quote=F)
 86 
 87 ##randomforest predicted answer
 88 predicted_answer <- paste("$opts{o}/",basename,"_predicted_answer.xls",sep="")
 89 write.table(otu.rf$predicted,predicted_answer,sep="\t",quote=F)
 90 
 91 ##randomforest classification table
 92 rf_table <- paste("$opts{o}/",basename,"_confusion_table.xls",sep="")
 93 write.table(otu.rf$confusion,rf_table,sep="\t",quote=F)
 94 mds <- cmdscale(1-otu.rf$proximity)  
 95 }else{
 96 otu.rf <- randomForest(otu,importance=T,proximity=T,ntree=$opts{ntree})
 97 mds <- cmdscale(1-otu.rf$proximity)
 98 }
 99 
100 
101 ##mds points
102 mds_points <- paste("$opts{o}/",basename,"_mds_sites.xls",sep="")
103 write.table(mds,mds_points,sep="\t",quote=F)
104 
105 ##proximity table
106 proximity <- paste("$opts{o}/",basename,"_proximity_table.xls",sep="")
107 write.table(otu.rf$proximity,proximity,sep="\t",quote=F)
108 
109 ## importance table
110 vimp_table <- paste("$opts{o}/",basename,"_vimp_table.xls",sep="")
111 write.table(otu.rf$importance,vimp_table,sep="\t",quote=F)
112 
113 
114 ## top importance species table
115 topx_vimp <- paste("$opts{o}/",basename,"_topx_vimp.xls",sep="")
116 imp <- importance(otu.rf)
117 if($opts{type} == 1){
118     top <- imp[order(imp[,"MeanDecreaseAccuracy"],decreasing=T),][1:min($opts{top},length(imp[,1])),]
119     write.table(t(otu)[rownames(top),],topx_vimp,sep="\t",quote=F)
120     
121 }else if ($opts{type} == 2){
122     top <- imp[order(imp[,"MeanDecreaseGini"],decreasing=T),][1:min($opts{top},length(imp[,1])),]
123         write.table(t(otu)[rownames(top),],topx_vimp,sep="\t",quote=F)
124 }
125 
126 ";
127 
128 `R --restore --no-save < $opts{o}/cmd.r`;
View Code
原文地址:https://www.cnblogs.com/neverforget/p/5784965.html