spark kmer计算
- 输入文件:fa格式的文件
- 输出结果:kmer的频数和对应的kmer类型和计数
1.将fq.gz的文件转换成fa文件:
#!/usr/bin/python env # -*- coding:utf-8 -*- import os import re import os.path import gzip import sys #在这里可以写一个函数用来将文件转换成id和序列对应的字典 #需要用到哪个转化操作呢?考虑先尝试使用filter或者map ''' @r261DFRAAXX100204:1:100:10494:3070/1 ACTGCATCCTGGAAAGAATCAATGGTGGCCGGAAAGTGTTTTTCAAATACAAGAGTGACAATGTGCCCTGTTGTTT + ACCCCCCCCCCCCCCCCCCCCCCCCCCCCCBC?CCCCCCCCC@@CACCCCCACCCCCCCCCCCCCCCCCCCCCCCC ''' #这里是利用python直接读取压缩的fastq文件 def read_gz_file(path): if os.path.exists(path): with gzip.open(path,'rt') as pf: for line in pf: yield line else: print 'the path [{}] is not exist!'.format(path) def ReadFastq(fastq): flag = 1 dict_fq={} if fastq.endswith('gz'): con = read_gz_file(fastq) if getattr(con,'__iter__','None'): for line in con: line=line.strip() flag_index = flag%4 if flag_index == 1: id = line if flag%4 == 2: seq = line else: flag +=1 continue dict_fq[id] = seq flag+=1 return dict_fq else: with open (fastq,'r') as fqr: for line in fqr.readlines(): line = line.strip() flag_index = flag%4 if flag_index == 1: id = line if flag%4 == 2: seq = line else: flag +=1 continue dict_fq[id] = seq flag+=1 return dict_fq def convert_to_fa(dict_hash,output): with open (output,'w') as fr: for i in dict_hash.keys(): fr.write(i+'\n') fr.write(dict_hash[i]+'\n') if __name__ == '__main__': input = sys.argv[1] output = sys.argv[2] dic_fa = ReadFastq(input) convert_to_fa(dic_fa,output)
2.将reads打断成kmer并统计kmer的频数
#!/usr/bin/env python # coding=utf-8 import os import sys import re from pyspark import SparkConf, SparkContext input_fasta_file ='/home/yueyao/Spark/00.data/both.fa' conf = SparkConf().setMaster("local").setAppName("Yue Yao app") sc = SparkContext(conf = conf) fasta_file = sc.textFile(input_fasta_file) #这里是对fasta文件进行转化操作,过滤掉reads的名称 reads_fa = fasta_file.filter(lambda line :">" not in line) #这个函数用来将reads打断成kmer,这里的kmer是25,返回一个列表 def map_file(line): seq_lis=[] for i in range(len(line)-25+1): sub_seq = line[i:i+25] seq_lis.append(sub_seq) return seq_lis kmer_list = reads_fa.flatMap(map_file) #对打断的kmer进行计数 kmer_count = kmer_list.map(lambda id:(id,1)) kmer_total_count = kmer_count.reduceByKey(lambda a,b:(a+b)) #这里过滤掉了含有N的kmer kmer_not_contain_N = kmer_total_count.filter(lambda line :"N" not in line[0]) kmer_key=kmer_not_contain_N.keys() #统计kmer的种类,并计数 kmer_vari_count = kmer_not_contain_N.map(lambda kmer_vari:(kmer_vari[1],1)) kmer_histo = kmer_vari_count.reduceByKey(lambda a,b:(a+b)) #输出kmer频数的结果 kmer_histo.saveAsTextFile('Kmer25.histo') kmer_not_contain_N.saveAsTextFile('kmer25') kmer_key.saveAsTextFile('kmer25_key')