大数据高级筛选与合并--C++实现与Python实现
需求描述:
从大量相同格式的源文件中筛选出符合同一个条件的数据,整合到一张工作表中。
具体源文件格式包括.csv,.txt,拥有相同的前缀名。
如下所示,.txt源文件中的目标数据包含非数字元素,.csv的同类数据存放在单元格中,因此需要判断数据类型:
.txt:
.csv:
目标数据可能包含非数字其他元素。
python脚本实现方式:
判断非数字函数:
1 def is_number(s): 2 try: 3 float(s) 4 return True 5 except ValueError: 6 pass 7 8 try: 9 import unicodedata 10 unicodedata.numeric(s) 11 return True 12 except (TypeError, ValueError): 13 pass 14 15 return False
遍历目标路径文件列表,对文件后缀正则分类:
1 from pathlib import Path 2 folder_path=r'D:\visual studio\Code\test' 3 path=Path(folder_path) 4 txt_file_list = [os.path.join(folder_path, file) for file in os.listdir(folder_path) ]
对于单一文件类型列表可以在listdir后加endswith限制条件:
1 txt_file_list = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if os.path.join(folder_path, file).endswith('.*')]
对txt_file_list中的对象进行遍历,并提取后缀:
1 OriginName=os.path.split(txt_file)[1] 2 TransName=OriginName.split('.') 3 L_Trans=len(TransName) 4 TypeName=TransName[L_Trans-1] 5 print(OriginName) 6 print(TypeName) 7 8 work_book=xlwt.Workbook(encoding='utf-8') #统一编码方式,否则后续可能无法识别 9 sheet=work_book.add_sheet('temp_'+TransName[0]) #临时sheet命名
对.txt和.csv分别进行处理,统一生成temp_name.csv临时文件:
1 if TypeName=='txt': 2 workbook=xlwt.Workbook() 3 sheet1=workbook.add_sheet('name_'+TransName[0],cell_overwrite_ok=True) 4 row0=[u"Gene.refGene",u"gnomAD_genome_ALL"] #筛选目标列 5 for i in range(0,len(row0)): 6 sheet1.write(0,i,row0[i]) 7 8 index=1 9 f=open(txt_file) 10 for line in f: 11 data=line.strip('\n').split('\t') 12 sheet1.write(index,0,data[6]) 13 sheet1.write(index,1,data[38]) 14 index=index+1 15 csv_file_name=TransName[0]+'.csv' 16 workbook.save(path.joinpath(csv_file_name)) 17 sheet=pd.read_excel(path.joinpath(csv_file_name),sheet_name=0) 18 os.remove(os.path.join(path,csv_file_name)) 19 elif TypeName=='csv': 20 sheet=pd.read_excel(path.joinpath(OriginName),sheet_name=0) 21 else: 22 ... # continue
继续使用loc函数和lambda表达式进一步按条件筛选出DataFrame:
1 DataFrame1 = sheet.copy() 2 DataFrame1 = DataFrame1[['Gene.refGene', 'gnomAD_genome_ALL']] 3 DataFrame1=DataFrame1.loc[DataFrame1['gnomAD_genome_ALL'].apply(lambda x:is_number(x))] 4 DataFrame1=DataFrame1.loc[DataFrame1['gnomAD_genome_ALL'].apply(lambda x:0<=float(x)<=0.05)] 5 DataTemp=DataFrame1.reset_index(drop=True) #忽略表头,重置索引 6 # os.remove(os.path.join(path,csv_file_name)) #删除临时文件
进行一些后续的频数统计之后使用concat或merge聚合DataFrame:
1 DataTemp=pd.concat([GenNameRes,freq],axis=1,join_axes=[freq.index]) 2 Res=pd.concat([Res,DataTemp],axis=1, join='outer')
数据拼接方法详见:
python数据拼接: pd.concat - boobo - 博客园 (cnblogs.com)
最后生成结果表格:
1 Res.to_excel(path.joinpath('result.xlsx'),index=False,encoding='utf-8')
完整代码:
1 import xlwt 2 import csv 3 import os 4 import pandas as pd 5 from pandas.core.frame import DataFrame 6 import numpy as np 7 from pathlib import Path 8 9 def is_number(s): 10 try: 11 float(s) 12 return True 13 except ValueError: 14 pass 15 16 try: 17 import unicodedata 18 unicodedata.numeric(s) 19 return True 20 except (TypeError, ValueError): 21 pass 22 23 return False 24 25 Res=DataFrame([]) 26 27 folder_path=r'D:\visual studio\Code\test' 28 path=Path(folder_path) 29 txt_file_list = [os.path.join(folder_path, file) for file in os.listdir(folder_path) ] 30 31 for txt_file in txt_file_list: 32 OriginName=os.path.split(txt_file)[1] 33 TransName=OriginName.split('.') 34 L_Trans=len(TransName) 35 TypeName=TransName[L_Trans-1] 36 print(OriginName) 37 print(TypeName) 38 39 work_book=xlwt.Workbook(encoding='utf-8') 40 sheet=work_book.add_sheet('temp_'+TransName[0]) 41 42 if TypeName=='txt': 43 workbook=xlwt.Workbook() 44 sheet1=workbook.add_sheet('name_'+TransName[0],cell_overwrite_ok=True) 45 row0=[u"Gene.refGene",u"gnomAD_genome_ALL"] 46 for i in range(0,len(row0)): 47 sheet1.write(0,i,row0[i]) 48 49 index=1 50 f=open(txt_file) 51 for line in f: 52 data=line.strip('\n').split('\t') 53 sheet1.write(index,0,data[6]) 54 sheet1.write(index,1,data[38]) 55 index=index+1 56 csv_file_name=TransName[0]+'.csv' 57 workbook.save(path.joinpath(csv_file_name)) 58 sheet=pd.read_excel(path.joinpath(csv_file_name),sheet_name=0) 59 os.remove(os.path.join(path,csv_file_name)) 60 elif TypeName=='csv': 61 sheet=pd.read_excel(path.joinpath(OriginName),sheet_name=0) 62 else: 63 continue 64 65 DataFrame1 = sheet.copy() 66 DataFrame1 = DataFrame1[['Gene.refGene', 'gnomAD_genome_ALL']] 67 DataFrame1=DataFrame1.loc[DataFrame1['gnomAD_genome_ALL'].apply(lambda x:is_number(x))] 68 DataFrame1=DataFrame1.loc[DataFrame1['gnomAD_genome_ALL'].apply(lambda x:0<=float(x)<=0.05)] 69 DataTemp=DataFrame1.reset_index(drop=True) 70 # os.remove(os.path.join(path,csv_file_name)) 71 72 freq=[] 73 GenNameRes=[] 74 colName=DataFrame1['Gene.refGene'] 75 colName=colName.reset_index(drop=True) 76 77 dict={} 78 for key in colName: 79 dict[key]=dict.get(key, 0)+1 80 81 for key in colName: 82 if key not in GenNameRes: 83 GenNameRes.append(key) 84 freq.append(str(dict[key])) 85 else: 86 continue 87 88 GenNameRes=DataFrame(GenNameRes) 89 freq=DataFrame(freq) 90 GenNameRes.columns=["Gene.refGene_"+TransName[0]] 91 freq.columns=["Frequency_"+TransName[0]] 92 93 DataTemp=pd.concat([GenNameRes,freq],axis=1,join_axes=[freq.index]) 94 Res=pd.concat([Res,DataTemp],axis=1, join='outer') 95 Res.to_excel(path.joinpath('result.xlsx'),index=False,encoding='utf-8') 96 Res
C++实现:
#include <iostream> #include <string> #include <fstream> #include <sstream> #include <iomanip> #include <vector> #include <cstring> #include <algorithm> #include <unordered_map> using namespace std; struct GeneData{ string GeneRef; float gnom[8]; int Times; }; struct Testgen{ int gen1; int gen2; string genref; }; int main(){ ofstream outfile; for(int i=1;i<=3;i++){ string path="./Test"; path+=('0'+i); path+=".csv"; cout<<path<<endl; //string path="./Test.csv"; ifstream ifs; ifs.open(path,ios::in); if(!ifs.is_open()){ cout<<"open failed"; system("pause"); } string line; vector<GeneData>GenVec;//1 vector<Testgen>TestgenVec;//2 vector<string>Vstr; unordered_map<string,int>M; int CNT=1; getline(ifs,line); while(getline(ifs,line)){ if(CNT==1){ CNT++; continue; } //cout<<line<<endl; stringstream ss(line); string str; GeneData Gen; Testgen TestGen; bool flag=true; int cnt=1; // while(cnt<=190){ // getline(ss,str,','); // if(cnt==7){ // Gen.GeneRef=stod(str); // } // if(cnt<=46 || cnt>=39){ // Gen.gnom[cnt-39]=stof(str); // if(stof(str)>0.05){ // flag=false; // } // } // cnt++; // } // if(flag==true){ // GenVec.push_back(Gen); // } while(cnt<=4){ getline(ss,str,','); //.txt文件为getline(ss,str,' ') if(cnt==2){ TestGen.gen1=stoi(str); if(TestGen.gen1<0)flag=false; } if(cnt==3){ TestGen.gen2=stoi(str); if(TestGen.gen2<0)flag=false; } if(cnt==4)TestGen.genref=str; cnt++; } if(flag==true){ TestgenVec.push_back(TestGen); } CNT++; } for(auto x:TestgenVec){ M[x.genref]++; } string res_path="./Res"; res_path+=('0'+i); res_path+=".csv"; cout<<res_path<<endl; if(i==1){ outfile.open("./Res.csv",ios::out); } outfile<<"chart"<<i<<endl; outfile<<"genref"<<','<<"Frequency"<<endl;//<<"gen1"<<','<<"gen2"<<',' for(auto x:TestgenVec){ outfile<<x.genref<<','<<M[x.genref]<<endl; }//<<x.gen1<<','<<x.gen2<<',' // for(auto x:Vstr){ // outfile<<x<<endl; // cout<<x<<endl; // } if(i==3){ outfile.close(); } } //ifstream infile("./Test.csv",ios::in); // for(auto x:GenVec){ // x.Times=M[x.GeneRef]; // cout<<x.Times<<endl; // } system("pause"); return 0; }
非注释代码为测试代码,测试用csv文件仅使用简化数据类型:
运行结果为:
C++实现方法有待优化文件编码方式;
其他实现方式请参考:
C++实现读取CSV文件数据将进行计算。_Liuxm-CSDN博客_c++读取csv文件
【C++】读取 .csv / .xlsx 文件中的指定数据(非常实用)_小朋友-CSDN博客_c++读取csv文件的某一列数据