python 遍历大文件,处理数据时,及时把中间结果写入文件而不是累积在变量里,变量不增大,从而节省内存

# Key idea: call write() as you go — stream each processed record straight to the output file instead of accumulating results in a growing in-memory variable.
def split_file(infile, n_parts, outdir): if not os.path.exists(infile): sys.stderr.write("Error: Can't find file: %s\n" % infile) sys.exit(1) fqname, ext = '', '' if infile.endswith(".fastq.gz"): fqname = os.path.basename(infile).split(".fastq.gz")[0] ext = "fastq.gz" elif infile.endswith(".fq.gz"): fqname = os.path.basename(infile).split(".fq.gz")[0] ext = "fq.gz" elif infile.endswith(".fastq"): fqname = os.path.basename(infile).split(".fastq")[0] ext = "fastq" elif infile.endswith(".fq"): fqname = os.path.basename(infile).split(".fq")[0] ext = "fq" else: sys.stderr.write("Error: The input files are not fastq format(*.fq.gz/*.fq/*.fastq.gz/*.fastq)\n") total_read_num, total_base_num = get_file_size(infile) elapsed_time = datetime.now() - START_TIME print "Loaded %s: %d sequences, %d bp, %d seconds elapsed" % (infile, total_read_num, total_base_num, elapsed_time.seconds) print "=> dividing into %d parts:" % n_parts read_num_per_file = total_read_num/n_parts if total_read_num % n_parts == 0 else int(total_read_num/n_parts)+1 num_len = len(str(n_parts)) with gzip.open(infile) if infile.endswith(".gz") else open(infile) as I: for part in range(1, n_parts+1): part_file = "%s.%0*d.%s" % (fqname, num_len, part, ext) out_sub_file = '/'.join([outdir, part_file]) print out_sub_file written = 0 with gzip.open(out_sub_file, "wb") if out_sub_file.endswith(".gz") else open(out_sub_file, "w") as OUT: is_done = False while not is_done and written < read_num_per_file: written += 1 is_done, _, read = get_fastq_read(I) OUT.write("%s\n" % read)# 如这里

  

本文来自博客园,作者:BioinformaticsMaster,转载请注明原文链接:https://www.cnblogs.com/koujiaodahan/p/15762794.html

原文地址:https://www.cnblogs.com/koujiaodahan/p/15762794.html