.NET实现 数据提取、转换和加载

早晨看到一篇文章:

http://www.codeproject.com/Articles/34556/Write-ETL-jobs-in-pure-C

 

有两个文件:User name file(Id、Name)和 Addresses file(Id、Address):

Id Name
1  Bob

Id Address
1 123 Main St.
2 42 Everywhich way
如果想得到 Id、Name、Address 合并在一起的结果,在 SQL 中用一个连接查询就可以得到。那么在文件处理的过程中,基于 C# 该如何处理呢?文章中的代码给出了解决方案。

首先读取两个文件的内容(FileEngine 基于 FileHelpers):
/// <summary>
/// ETL operation that reads <c>UserNameRecord</c> entries from a file
/// and emits each one as a <see cref="Row"/>.
/// </summary>
public class UserNameRead : AbstractOperation
{
    // Path of the user-name file; assigned once in the constructor.
    private readonly string filePath;

    /// <summary>
    /// Creates the reader for the given user-name file.
    /// </summary>
    /// <param name="filePath">Path of the file to read.</param>
    public UserNameRead(string filePath)
    {
        this.filePath = filePath;
    }

    /// <summary>
    /// Opens the file and lazily yields one row per record found in it.
    /// </summary>
    /// <param name="rows">Incoming rows; not used by this operation.</param>
    /// <returns>One <see cref="Row"/> built from each record in the file.</returns>
    public override IEnumerable<Row> Execute(IEnumerable<Row> rows)
    {
        FluentFile source = FluentFile.For<UserNameRecord>();

        // The engine stays open while the caller enumerates and is
        // disposed when enumeration finishes or is abandoned.
        using (FileEngine reader = source.From(filePath))
        {
            foreach (object record in reader)
            {
                yield return Row.FromObject(record);
            }
        }
    }
}

/// <summary>
/// ETL operation that reads <c>UserAddressRecord</c> entries from a file
/// and emits each one as a <see cref="Row"/>.
/// </summary>
public class UserAddressRead : AbstractOperation
{
    // Path of the address file; assigned once in the constructor.
    private readonly string filePath;

    /// <summary>
    /// Creates the reader for the given address file.
    /// </summary>
    /// <param name="filePath">Path of the file to read.</param>
    public UserAddressRead(string filePath)
    {
        this.filePath = filePath;
    }

    /// <summary>
    /// Opens the file and lazily yields one row per record found in it.
    /// </summary>
    /// <param name="rows">Incoming rows; not used by this operation.</param>
    /// <returns>One <see cref="Row"/> built from each record in the file.</returns>
    public override IEnumerable<Row> Execute(IEnumerable<Row> rows)
    {
        FluentFile source = FluentFile.For<UserAddressRecord>();

        // The engine stays open while the caller enumerates and is
        // disposed when enumeration finishes or is abandoned.
        using (FileEngine reader = source.From(filePath))
        {
            foreach (object record in reader)
            {
                yield return Row.FromObject(record);
            }
        }
    }
}

建立两个文件之间的关联(按 Id 做内连接),并构造合并后的记录:
/// <summary>
/// Join operation that pairs left and right rows on their "Id" column
/// and produces a merged row carrying the right row's "Address".
/// </summary>
public class JoinUserRecords : JoinOperation
{
    /// <summary>
    /// Declares an inner join matching the left "Id" against the right "Id".
    /// </summary>
    protected override void SetupJoinConditions()
    {
        InnerJoin.Left("Id").Right("Id");
    }

    /// <summary>
    /// Builds the output row for a matched pair of rows.
    /// </summary>
    /// <param name="leftRow">Row from the left side of the join.</param>
    /// <param name="rightRow">Row from the right side of the join.</param>
    /// <returns>
    /// A new row holding everything copied from <paramref name="leftRow"/>
    /// plus the "Address" value of <paramref name="rightRow"/>.
    /// </returns>
    protected override Row MergeRows(Row leftRow, Row rightRow)
    {
        Row merged = new Row();

        // Start from the left row's contents...
        merged.Copy(leftRow);

        // ...then bring over the one property the left side lacks.
        merged["Address"] = rightRow["Address"];

        return merged;
    }
}


合并后的结果如何输出:
/// <summary>
/// ETL operation that writes each incoming row to a file as a
/// <c>UserFullRecord</c> and passes the row through unchanged.
/// </summary>
public class UserFullWrite : AbstractOperation
{
    // Path of the output file; assigned once in the constructor.
    private readonly string filePath;

    /// <summary>
    /// Creates the writer for the given output file.
    /// </summary>
    /// <param name="filePath">Path of the file to write.</param>
    public UserFullWrite(string filePath)
    {
        this.filePath = filePath;
    }

    /// <summary>
    /// Writes every incoming row to the output file and yields it again.
    /// </summary>
    /// <param name="rows">Rows to write.</param>
    /// <returns>The same rows, in the same order, after each has been written.</returns>
    public override IEnumerable<Row> Execute(IEnumerable<Row> rows)
    {
        FluentFile target = FluentFile.For<UserFullRecord>();
        target.HeaderText = "Id\tName\tAddress";

        // The engine stays open while the caller enumerates and is
        // disposed when enumeration finishes or is abandoned.
        using (FileEngine writer = target.To(filePath))
        {
            foreach (Row current in rows)
            {
                writer.Write(current.ToObject<UserFullRecord>());

                // Hand the row on so a later operation can still consume it.
                yield return current;
            }
        }
    }
}

调用方法:
/// <summary>
/// ETL process that joins the user-name file with the address file
/// and writes the merged records to the output file.
/// </summary>
public class MainProcess : EtlProcess
{
    /// <summary>
    /// Registers the pipeline: first the join of the two readers, then the writer.
    /// File paths come from <c>Settings.Default</c>.
    /// </summary>
    protected override void Initialize()
    {
        // Names feed the left side of the join, addresses the right side.
        var joinedUsers = new JoinUserRecords()
            .Left(new UserNameRead(Settings.Default.NamesFile))
            .Right(new UserAddressRead(Settings.Default.AddressesFile));
        Register(joinedUsers);

        var outputWriter = new UserFullWrite(Settings.Default.OutputFile);
        Register(outputWriter);
    }
}

总结:对于结构化的文件,通常比较容易处理;但对于非结构化的文件,则不易处理。

posted @ 2012-12-08 10:37  smodi  阅读(342)  评论(0编辑  收藏  举报