.NET实现 数据提取、转换和加载
早晨看到一篇文章:
http://www.codeproject.com/Articles/34556/Write-ETL-jobs-in-pure-C
有两个文件:用户名文件(Id、Name)和地址文件(Id、Address):
Id Name
1 Bob
Id Address
1 123 Main St.
2 42 Everywhich way
如果想得到
Id Name Address
如果是在 SQL 中,一个查询就可以得到。那么在文件处理的过程中,基于 C# 该如何处理呢?文章的代码中给出了解决方案。
首先读取两个文件的内容:
FileEngine是基于 FileHelpers
/// <summary>
/// Source operation that reads the user-name file and emits one <c>Row</c> per record.
/// </summary>
public class UserNameRead : AbstractOperation
{
    // Path of the file to read; assigned once by the constructor.
    private readonly string sourcePath;

    /// <summary>Creates a reader for the file located at <paramref name="filePath"/>.</summary>
    /// <param name="filePath">Path of the user-name file.</param>
    public UserNameRead(string filePath)
    {
        sourcePath = filePath;
    }

    /// <summary>
    /// Lazily enumerates the file as <c>UserNameRecord</c> entries and converts each one to a <c>Row</c>.
    /// </summary>
    /// <param name="rows">Incoming rows; this operation does not read them.</param>
    /// <returns>One row per record found in the file.</returns>
    public override IEnumerable<Row> Execute(IEnumerable<Row> rows)
    {
        // The engine is opened inside the iterator, so nothing is read until enumeration starts.
        using (FileEngine reader = FluentFile.For<UserNameRecord>().From(sourcePath))
        {
            foreach (object record in reader)
            {
                yield return Row.FromObject(record);
            }
        }
    }
}

/// <summary>
/// Source operation that reads the address file and emits one <c>Row</c> per record.
/// </summary>
public class UserAddressRead : AbstractOperation
{
    // Path of the file to read; assigned once by the constructor.
    private readonly string sourcePath;

    /// <summary>Creates a reader for the file located at <paramref name="filePath"/>.</summary>
    /// <param name="filePath">Path of the address file.</param>
    public UserAddressRead(string filePath)
    {
        sourcePath = filePath;
    }

    /// <summary>
    /// Lazily enumerates the file as <c>UserAddressRecord</c> entries and converts each one to a <c>Row</c>.
    /// </summary>
    /// <param name="rows">Incoming rows; this operation does not read them.</param>
    /// <returns>One row per record found in the file.</returns>
    public override IEnumerable<Row> Execute(IEnumerable<Row> rows)
    {
        // The engine is opened inside the iterator, so nothing is read until enumeration starts.
        using (FileEngine reader = FluentFile.For<UserAddressRecord>().From(sourcePath))
        {
            foreach (object record in reader)
            {
                yield return Row.FromObject(record);
            }
        }
    }
}
建立两个文件之间的关联关系,并构造合并后的新记录:
/// <summary>
/// Joins the left and right row streams on their "Id" column and
/// produces rows that carry the left row's values plus the right row's "Address".
/// </summary>
public class JoinUserRecords : JoinOperation
{
    /// <summary>Declares an inner join that matches the "Id" column of both sides.</summary>
    protected override void SetupJoinConditions()
    {
        InnerJoin.Left("Id").Right("Id");
    }

    /// <summary>
    /// Builds the output row for a matched pair of rows.
    /// </summary>
    /// <param name="leftRow">Row from the left input; all of its values are copied.</param>
    /// <param name="rightRow">Row from the right input; only its "Address" value is taken.</param>
    /// <returns>A new row combining both inputs.</returns>
    protected override Row MergeRows(Row leftRow, Row rightRow)
    {
        Row merged = new Row();

        // Start from everything the left row holds ...
        merged.Copy(leftRow);

        // ... then add the one column that only exists on the right side.
        merged["Address"] = rightRow["Address"];

        return merged;
    }
}
创建好的结构 如何输出:
/// <summary>
/// Sink operation that writes each incoming row to a file as a <c>UserFullRecord</c>
/// and passes the row on unchanged.
/// </summary>
public class UserFullWrite : AbstractOperation
{
    // Path of the output file; assigned once by the constructor.
    private readonly string targetPath;

    /// <summary>Creates a writer targeting the file at <paramref name="filePath"/>.</summary>
    /// <param name="filePath">Path of the file to write.</param>
    public UserFullWrite(string filePath)
    {
        targetPath = filePath;
    }

    /// <summary>
    /// Writes every row from <paramref name="rows"/> to the output file and yields it again.
    /// </summary>
    /// <param name="rows">Rows to persist.</param>
    /// <returns>The same rows, in the same order, so a later operation can consume them.</returns>
    /// <remarks>
    /// This is an iterator: rows are written only as the returned sequence is enumerated.
    /// </remarks>
    public override IEnumerable<Row> Execute(IEnumerable<Row> rows)
    {
        FluentFile engine = FluentFile.For<UserFullRecord>();
        engine.HeaderText = "Id\tName\tAddress";

        using (FileEngine writer = engine.To(targetPath))
        {
            foreach (Row current in rows)
            {
                UserFullRecord record = current.ToObject<UserFullRecord>();
                writer.Write(record);

                // Pass the row through in case another operation follows this one.
                yield return current;
            }
        }
    }
}
调用方法:
/// <summary>
/// ETL process that joins the user-name and address files and writes the combined records.
/// </summary>
public class MainProcess : EtlProcess
{
    /// <summary>
    /// Registers the pipeline: a join of the two file readers, followed by the file writer.
    /// File paths come from <c>Settings.Default</c>.
    /// </summary>
    protected override void Initialize()
    {
        // The two inputs of the join: names on the left, addresses on the right.
        UserNameRead names = new UserNameRead(Settings.Default.NamesFile);
        UserAddressRead addresses = new UserAddressRead(Settings.Default.AddressesFile);

        Register(new JoinUserRecords().Left(names).Right(addresses));
        Register(new UserFullWrite(Settings.Default.OutputFile));
    }
}
总结:结构化的文件通常比较容易处理,而非结构化的文件则不易处理。