Python 遍历大文件处理数据时,随时把处理结果写入文件,而不是累积在不断增大的变量里,从而节省内存
# Prefer write(): send each record to the output file as soon as it is read
# instead of accumulating records in an ever-growing variable.
def split_file(infile, n_parts, outdir):
    """Split a FASTQ file into ``n_parts`` consecutive part files in ``outdir``.

    Reads are copied one at a time from ``infile`` to the current part file,
    so no read is retained after it has been written.

    Part files are named ``<name>.<part>.<ext>`` where ``<name>`` is the input
    basename up to its FASTQ extension, ``<part>`` is the 1-based part number
    zero-padded to the width of ``n_parts``, and ``<ext>`` is the input's
    extension. Inputs and outputs whose names end in ``.gz`` are opened with
    ``gzip``; others are opened as plain files.

    Args:
        infile: Path to the input file. It must end in ``.fastq.gz``,
            ``.fq.gz``, ``.fastq`` or ``.fq``.
        n_parts: Number of part files to create; must be at least 1.
        outdir: Directory in which the part files are created.

    Raises:
        SystemExit: With status 1, after a message on stderr, when ``infile``
            does not exist, has an unsupported extension, or ``n_parts`` is
            smaller than 1.
    """
    if not os.path.exists(infile):
        sys.stderr.write("Error: Can't find file: %s\n" % infile)
        sys.exit(1)

    basename = os.path.basename(infile)
    for ext in ("fastq.gz", "fq.gz", "fastq", "fq"):
        suffix = "." + ext
        if infile.endswith(suffix):
            fqname = basename.split(suffix)[0]
            break
    else:
        sys.stderr.write(
            "Error: The input files are not fastq format"
            "(*.fq.gz/*.fq/*.fastq.gz/*.fastq)\n"
        )
        # Stop here: continuing would name every part file after a blank
        # placeholder instead of the input.
        sys.exit(1)

    if n_parts < 1:
        # Guard the division below against zero or negative part counts.
        sys.stderr.write(
            "Error: n_parts must be a positive integer, got %r\n" % (n_parts,)
        )
        sys.exit(1)

    total_read_num, total_base_num = get_file_size(infile)
    elapsed_time = datetime.now() - START_TIME
    # Single-argument parenthesised print behaves the same as a Python 2
    # print statement and as the Python 3 print function.
    print(
        "Loaded %s: %d sequences, %d bp, %d seconds elapsed"
        % (infile, total_read_num, total_base_num, elapsed_time.seconds)
    )
    print("=> dividing into %d parts:" % n_parts)

    # Ceiling division: every part takes this many reads, the last one fewer.
    read_num_per_file = (total_read_num + n_parts - 1) // n_parts
    num_len = len(str(n_parts))

    open_input = gzip.open if infile.endswith(".gz") else open
    with open_input(infile) as src:
        for part in range(1, n_parts + 1):
            part_file = "%s.%0*d.%s" % (fqname, num_len, part, ext)
            out_sub_file = os.path.join(outdir, part_file)
            print(out_sub_file)

            written = 0
            if out_sub_file.endswith(".gz"):
                out_handle = gzip.open(out_sub_file, "wb")
            else:
                out_handle = open(out_sub_file, "w")
            with out_handle as out:
                is_done = False
                while not is_done and written < read_num_per_file:
                    written += 1
                    # NOTE(review): `read` is written even when is_done comes
                    # back true; whether get_fastq_read returns a usable
                    # record in that case is not visible here -- confirm.
                    is_done, _, read = get_fastq_read(src)
                    out.write("%s\n" % read)  # e.g. here: write immediately
本文来自博客园,作者:BioinformaticsMaster,转载请注明原文链接:https://www.cnblogs.com/koujiaodahan/p/15762794.html
分类:
python
posted on 2022-01-04 15:53 BioinformaticsMaster 阅读(145) 评论(0) 编辑 收藏 举报
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律