给记录排序并去除重复记录

目录

一、流程图

二、 代码 

排序记录

去除重复记录

三、运行

完整代码

ktr文件


一、流程图

        如下图,本次重点是排序记录和去除重复记录两个节点说明。

         排序记录界面配置如下,一般只需要填写字段列信息即可,其他信息使用默认的就行,

         去除重复记录也是填写下面比较的字段即可,这里会比较id,name,age三个内容的值,如果一致那么就会去掉一条,当然也可以仅仅指定id,如果是相同的id那么也意味着相同记录。

二、 代码 

        关于如何根据ktr文件来编写代码,请参考前面写的博客,这里仅贴出代码,仅供参考。

排序记录

/**
     * Builds the "排序记录" (sort rows) step and adds it to the transformation.
     *
     * <p>The settings mirror the {@code SortRows} step of the accompanying ktr file;
     * the XML tag each call corresponds to is noted inline.
     *
     * @param transMeta transformation the new step is added to
     * @param registry  plugin registry used to resolve the step's plugin id
     * @return the step that was created and added to {@code transMeta}
     */
    private StepMeta getSortRowsStep(TransMeta transMeta, PluginRegistry registry){
        SortRowsMeta sortRowsMeta = new SortRowsMeta();

        sortRowsMeta.setDirectory("%%java.io.tmpdir%%"); // <directory>%%java.io.tmpdir%%</directory>
        sortRowsMeta.setPrefix("out");                   // <prefix>out</prefix>
        sortRowsMeta.setSortSize("1000000");             // <sort_size>1000000</sort_size>
        sortRowsMeta.setCompressFiles(false);            // <compress>N</compress>
        // The sort step has its own "only pass unique rows" switch; it stays off here
        // because de-duplication is delegated to the separate unique-rows step.
        sortRowsMeta.setOnlyPassingUniqueRows(false);    // <unique_rows>N</unique_rows>

        /*
         Each sort field is serialized in the ktr as:
         <field>
            <name>id</name>
            <ascending>Y</ascending>
            <case_sensitive>N</case_sensitive>
            <collator_enabled>N</collator_enabled>
            <collator_strength>0</collator_strength>
            <presorted>N</presorted>
          </field>
         The arrays below are index-aligned with the field names.
         */
        // Sort keys
        sortRowsMeta.setFieldName(new String[]{"id","name","age"});
        // true = ascending
        sortRowsMeta.setAscending(new boolean[]{true,true,true});
        // "name" is sorted case-sensitively, matching <case_sensitive>Y</case_sensitive>
        // in the ktr. The unique-rows step in this file compares case-sensitively, so a
        // case-insensitive sort could leave identical names non-adjacent and let
        // duplicates slip through.
        sortRowsMeta.setCaseSensitive(new boolean[]{false,true,false});
        sortRowsMeta.setCollatorEnabled(new boolean[]{false,false,false});
        sortRowsMeta.setCollatorStrength(new int[]{0,0,0});
        sortRowsMeta.setPreSortedField(new boolean[]{false,false,false});

        String sortRowsPluginId = registry.getPluginId(StepPluginType.class, sortRowsMeta);
        StepMeta sortRowsStep = new StepMeta(sortRowsPluginId, "排序记录", (StepMetaInterface) sortRowsMeta);

        // Canvas placement, as in the ktr <GUI> section
        sortRowsStep.setDraw(true);
        sortRowsStep.setLocation(448,384);

        transMeta.addStep(sortRowsStep);

        return sortRowsStep;
    }

去除重复记录

/**
     * Creates the "去除重复记录" (unique rows) step and adds it to the transformation.
     *
     * @param transMeta transformation that receives the new step
     * @param registry  plugin registry used to resolve the step's plugin id
     * @return the step that was created and added to {@code transMeta}
     */
    private StepMeta getUniqueStep(TransMeta transMeta, PluginRegistry registry){
        // Rows are considered duplicates when all of these fields match
        final String[] comparedFields = {"id", "name", "age"};
        // One flag per compared field; every comparison stays case-sensitive
        final boolean[] ignoreCaseFlags = {false, false, false};

        UniqueRowsMeta dedupMeta = new UniqueRowsMeta();
        dedupMeta.setCountRows(false);          // <count_rows>N</count_rows>
        dedupMeta.setRejectDuplicateRow(false); // <reject_duplicate_row>N</reject_duplicate_row>
        dedupMeta.setCompareFields(comparedFields);
        dedupMeta.setCaseInsensitive(ignoreCaseFlags);

        StepMeta dedupStep = new StepMeta(
                registry.getPluginId(StepPluginType.class, dedupMeta),
                "去除重复记录",
                (StepMetaInterface) dedupMeta);

        // Canvas placement, as in the ktr <GUI> section
        dedupStep.setDraw(true);
        dedupStep.setLocation(560, 384);

        transMeta.addStep(dedupStep);
        return dedupStep;
    }

三、运行

        运行后结果如下图所示:记录已按字段排序,并去除了重复记录,一次性运行通过,没有出现任何错误。

完整代码

/**
     * Initializes the Kettle environment before each test.
     *
     * @throws IllegalStateException if Kettle cannot be initialized; running the
     *         test against an uninitialized environment would only fail later
     *         with a less helpful error
     */
    @Before
    public void before() {
        try {
            // Initialize the environment
            EnvUtil.environmentInit();
            KettleEnvironment.init();
        } catch (KettleException e) {
            log.error("Kettle environment initialization failed", e);
            // Fail fast instead of silently continuing with a broken environment
            throw new IllegalStateException("Kettle environment initialization failed", e);
        }
    }

    /**
     * Runs an Excel input -> sort rows -> unique rows -> Excel output transformation.
     *
     * <p>Rows must be sorted before duplicates are removed, because the unique-rows
     * step only compares neighbouring rows. "Sort rows + unique rows" therefore gives
     * the same result as the "unique rows (HashSet)" step, which instead computes a
     * hash per row and drops rows whose hash was already seen. In theory the
     * hash-based variant is the faster of the two.
     *
     * @throws KettleException if the transformation cannot be started or finishes
     *         with errors
     */
    @Test
    public void exchangeWithSortAndUnique() throws KettleException {

        TransMeta transMeta = new TransMeta();
        transMeta.setName("去除重复记录");

        PluginRegistry registry = PluginRegistry.getInstance();

        // Each helper builds one step and adds it to transMeta
        StepMeta inputStep = getInputStep(transMeta,registry);
        StepMeta sortStep = getSortRowsStep(transMeta,registry);
        StepMeta uniqueStep = getUniqueStep(transMeta,registry);
        StepMeta outStep = getOutputStep(transMeta,registry);

        /*
        4. Connect the steps with hops: input -> sort -> unique -> output
         */
        transMeta.addTransHop(new TransHopMeta(inputStep, sortStep));
        transMeta.addTransHop(new TransHopMeta(sortStep, uniqueStep));
        transMeta.addTransHop(new TransHopMeta(uniqueStep, outStep));

        /*
        5. Execute
         */
        Trans trans = new Trans(transMeta);

        // Start the transformation
        trans.execute(null);

        // Block until every step has finished
        trans.waitUntilFinished();

        // Fail the test on errors; merely printing would let a broken run pass
        int errorCount = trans.getErrors();
        if (errorCount > 0) {
            throw new KettleException("交换出错. errors=" + errorCount);
        }

    }

    /**
     * Creates the "去除重复记录" (unique rows) step and adds it to the transformation.
     *
     * @param transMeta transformation that receives the new step
     * @param registry  plugin registry used to resolve the step's plugin id
     * @return the step that was created and added to {@code transMeta}
     */
    private StepMeta getUniqueStep(TransMeta transMeta, PluginRegistry registry){
        // Rows are considered duplicates when all of these fields match
        final String[] comparedFields = {"id", "name", "age"};
        // One flag per compared field; every comparison stays case-sensitive
        final boolean[] ignoreCaseFlags = {false, false, false};

        UniqueRowsMeta dedupMeta = new UniqueRowsMeta();
        dedupMeta.setCountRows(false);          // <count_rows>N</count_rows>
        dedupMeta.setRejectDuplicateRow(false); // <reject_duplicate_row>N</reject_duplicate_row>
        dedupMeta.setCompareFields(comparedFields);
        dedupMeta.setCaseInsensitive(ignoreCaseFlags);

        StepMeta dedupStep = new StepMeta(
                registry.getPluginId(StepPluginType.class, dedupMeta),
                "去除重复记录",
                (StepMetaInterface) dedupMeta);

        // Canvas placement, as in the ktr <GUI> section
        dedupStep.setDraw(true);
        dedupStep.setLocation(560, 384);

        transMeta.addStep(dedupStep);
        return dedupStep;
    }

    /**
     * Builds the "排序记录" (sort rows) step and adds it to the transformation.
     *
     * <p>The settings mirror the {@code SortRows} step of the accompanying ktr file;
     * the XML tag each call corresponds to is noted inline.
     *
     * @param transMeta transformation the new step is added to
     * @param registry  plugin registry used to resolve the step's plugin id
     * @return the step that was created and added to {@code transMeta}
     */
    private StepMeta getSortRowsStep(TransMeta transMeta, PluginRegistry registry){
        SortRowsMeta sortRowsMeta = new SortRowsMeta();

        sortRowsMeta.setDirectory("%%java.io.tmpdir%%"); // <directory>%%java.io.tmpdir%%</directory>
        sortRowsMeta.setPrefix("out");                   // <prefix>out</prefix>
        sortRowsMeta.setSortSize("1000000");             // <sort_size>1000000</sort_size>
        sortRowsMeta.setCompressFiles(false);            // <compress>N</compress>
        // The sort step has its own "only pass unique rows" switch; it stays off here
        // because de-duplication is delegated to the separate unique-rows step.
        sortRowsMeta.setOnlyPassingUniqueRows(false);    // <unique_rows>N</unique_rows>

        /*
         Each sort field is serialized in the ktr as:
         <field>
            <name>id</name>
            <ascending>Y</ascending>
            <case_sensitive>N</case_sensitive>
            <collator_enabled>N</collator_enabled>
            <collator_strength>0</collator_strength>
            <presorted>N</presorted>
          </field>
         The arrays below are index-aligned with the field names.
         */
        // Sort keys
        sortRowsMeta.setFieldName(new String[]{"id","name","age"});
        // true = ascending
        sortRowsMeta.setAscending(new boolean[]{true,true,true});
        // "name" is sorted case-sensitively, matching <case_sensitive>Y</case_sensitive>
        // in the ktr. The unique-rows step in this file compares case-sensitively, so a
        // case-insensitive sort could leave identical names non-adjacent and let
        // duplicates slip through.
        sortRowsMeta.setCaseSensitive(new boolean[]{false,true,false});
        sortRowsMeta.setCollatorEnabled(new boolean[]{false,false,false});
        sortRowsMeta.setCollatorStrength(new int[]{0,0,0});
        sortRowsMeta.setPreSortedField(new boolean[]{false,false,false});

        String sortRowsPluginId = registry.getPluginId(StepPluginType.class, sortRowsMeta);
        StepMeta sortRowsStep = new StepMeta(sortRowsPluginId, "排序记录", (StepMetaInterface) sortRowsMeta);

        // Canvas placement, as in the ktr <GUI> section
        sortRowsStep.setDraw(true);
        sortRowsStep.setLocation(448,384);

        transMeta.addStep(sortRowsStep);

        return sortRowsStep;
    }

    /**
     * Creates the "Excel输入" (Excel input) step and adds it to the transformation.
     *
     * @param transMeta transformation that receives the new step
     * @param registry  plugin registry used to resolve the step's plugin id
     * @return the step that was created and added to {@code transMeta}
     */
    private StepMeta getInputStep(TransMeta transMeta, PluginRegistry registry) {
        ExcelInputMeta excelInput = new ExcelInputMeta(); // <type>ExcelInput</type>

        // One source file; the per-file arrays below all have a single entry
        excelInput.setFileName(new String[]{"F:\\kette_test\\input\\去除重复记录.xlsx"}); // <file><name>
        excelInput.setFileMask(new String[1]);              // <filemask/>
        excelInput.setExcludeFileMask(new String[1]);       // <exclude_filemask/>
        excelInput.setFileRequired(new String[]{"N"});      // <file_required>N</file_required>
        excelInput.setIncludeSubFolders(new String[]{"N"}); // <include_subfolders>N</include_subfolders>

        excelInput.setSpreadSheetType(SpreadSheetType.POI); // <spreadsheet_type>POI</spreadsheet_type>

        // Per the original author: Kettle 8.x needs this set when the sheet has a
        // header row, 5.x does not.
        excelInput.setStartsWithHeader(true);

        // Start at the second row (index 1) and the first column (index 0)
        excelInput.setStartRow(new int[]{1});
        excelInput.setStartColumn(new int[]{0});

        // Column definitions: the <fields> ... </fields> section of the step
        final String[] columnNames = {"id", "name", "age"};
        final int[] columnTypes = {
                ValueMetaInterface.TYPE_NUMBER,
                ValueMetaInterface.TYPE_STRING,
                ValueMetaInterface.TYPE_NUMBER};

        final ExcelInputField[] columns = new ExcelInputField[columnNames.length];
        for (int idx = 0; idx < columns.length; idx++) {
            ExcelInputField column = new ExcelInputField();
            column.setName(columnNames[idx]);
            column.setType(columnTypes[idx]);
            column.setTrimType(ExcelInputMeta.TYPE_TRIM_NONE);
            column.setRepeated(false);
            columns[idx] = column;
        }
        excelInput.setField(columns);

        // 2. Wrap the meta in a step and add it to the transformation
        StepMeta excelInputStep = new StepMeta(
                registry.getPluginId(StepPluginType.class, excelInput),
                "Excel输入",
                (StepMetaInterface) excelInput); // <step><name>

        // Canvas placement, as in the ktr <GUI> section
        excelInputStep.setDraw(true);
        excelInputStep.setLocation(336, 256);

        transMeta.addStep(excelInputStep);
        return excelInputStep;
    }

    /**
     * Creates the "Excel输出" (Excel output) step and adds it to the transformation.
     *
     * @param transMeta transformation that receives the new step
     * @param registry  plugin registry used to resolve the step's plugin id
     * @return the step that was created and added to {@code transMeta}
     */
    private StepMeta getOutputStep(TransMeta transMeta, PluginRegistry registry) {
        ExcelOutputMeta excelOutput = new ExcelOutputMeta(); // <type>ExcelOutput</type>
        excelOutput.setAppend(false);        // <append>N</append>
        excelOutput.setHeaderEnabled(true);  // <header>Y</header>
        excelOutput.setFooterEnabled(false); // <footer>N</footer>

        // Deliberately a different file name than the ktr uses, so this output can be
        // told apart from the file produced by running the transformation in Spoon.
        excelOutput.setFileName("F:\\kette_test\\output\\去除重复记录2.xls"); // <file><name>
        excelOutput.setDoNotOpenNewFileInit(false); // <do_not_open_newfile_init>N</do_not_open_newfile_init>
        excelOutput.setCreateParentFolder(false);   // <create_parent_folder>N</create_parent_folder>

        // Column definitions: the <fields> ... </fields> section of the step
        final String[] columnNames = {"id", "name", "age"};
        final int[] columnTypes = {
                ValueMetaInterface.TYPE_NUMBER,
                ValueMetaInterface.TYPE_STRING,
                ValueMetaInterface.TYPE_NUMBER};

        final ExcelField[] columns = new ExcelField[columnNames.length];
        for (int idx = 0; idx < columns.length; idx++) {
            final String columnName = columnNames[idx];

            ExcelField column = new ExcelField();
            column.setName(columnName);
            column.setType(columnTypes[idx]);

            // "id" and "age" carry <format>0</format> in the ktr; "name" has no format
            if ("id".equals(columnName) || "age".equals(columnName)) {
                column.setFormat("0");
            }
            columns[idx] = column;
        }
        excelOutput.setOutputFields(columns);

        StepMeta excelOutputStep = new StepMeta(
                registry.getPluginId(StepPluginType.class, excelOutput),
                "Excel输出",
                (StepMetaInterface) excelOutput); // <step><name>

        // Canvas placement, as in the ktr <GUI> section
        excelOutputStep.setDraw(true);
        excelOutputStep.setLocation(704, 272);

        transMeta.addStep(excelOutputStep);
        return excelOutputStep;
    }

ktr文件

<?xml version="1.0" encoding="UTF-8"?>
<transformation>
  <info>
    <name>去除重复记录</name>
    <description/>
    <extended_description/>
    <trans_version/>
    <trans_type>Normal</trans_type>
    <directory>/</directory>
    <parameters>
    </parameters>
    <log>
      <trans-log-table>
        <connection/>
        <schema/>
        <table/>
        <size_limit_lines/>
        <interval/>
        <timeout_days/>
        <field>
          <id>ID_BATCH</id>
          <enabled>Y</enabled>
          <name>ID_BATCH</name>
        </field>
        <field>
          <id>CHANNEL_ID</id>
          <enabled>Y</enabled>
          <name>CHANNEL_ID</name>
        </field>
        <field>
          <id>TRANSNAME</id>
          <enabled>Y</enabled>
          <name>TRANSNAME</name>
        </field>
        <field>
          <id>STATUS</id>
          <enabled>Y</enabled>
          <name>STATUS</name>
        </field>
        <field>
          <id>LINES_READ</id>
          <enabled>Y</enabled>
          <name>LINES_READ</name>
          <subject/>
        </field>
        <field>
          <id>LINES_WRITTEN</id>
          <enabled>Y</enabled>
          <name>LINES_WRITTEN</name>
          <subject/>
        </field>
        <field>
          <id>LINES_UPDATED</id>
          <enabled>Y</enabled>
          <name>LINES_UPDATED</name>
          <subject/>
        </field>
        <field>
          <id>LINES_INPUT</id>
          <enabled>Y</enabled>
          <name>LINES_INPUT</name>
          <subject/>
        </field>
        <field>
          <id>LINES_OUTPUT</id>
          <enabled>Y</enabled>
          <name>LINES_OUTPUT</name>
          <subject/>
        </field>
        <field>
          <id>LINES_REJECTED</id>
          <enabled>Y</enabled>
          <name>LINES_REJECTED</name>
          <subject/>
        </field>
        <field>
          <id>ERRORS</id>
          <enabled>Y</enabled>
          <name>ERRORS</name>
        </field>
        <field>
          <id>STARTDATE</id>
          <enabled>Y</enabled>
          <name>STARTDATE</name>
        </field>
        <field>
          <id>ENDDATE</id>
          <enabled>Y</enabled>
          <name>ENDDATE</name>
        </field>
        <field>
          <id>LOGDATE</id>
          <enabled>Y</enabled>
          <name>LOGDATE</name>
        </field>
        <field>
          <id>DEPDATE</id>
          <enabled>Y</enabled>
          <name>DEPDATE</name>
        </field>
        <field>
          <id>REPLAYDATE</id>
          <enabled>Y</enabled>
          <name>REPLAYDATE</name>
        </field>
        <field>
          <id>LOG_FIELD</id>
          <enabled>Y</enabled>
          <name>LOG_FIELD</name>
        </field>
        <field>
          <id>EXECUTING_SERVER</id>
          <enabled>N</enabled>
          <name>EXECUTING_SERVER</name>
        </field>
        <field>
          <id>EXECUTING_USER</id>
          <enabled>N</enabled>
          <name>EXECUTING_USER</name>
        </field>
        <field>
          <id>CLIENT</id>
          <enabled>N</enabled>
          <name>CLIENT</name>
        </field>
      </trans-log-table>
      <perf-log-table>
        <connection/>
        <schema/>
        <table/>
        <interval/>
        <timeout_days/>
        <field>
          <id>ID_BATCH</id>
          <enabled>Y</enabled>
          <name>ID_BATCH</name>
        </field>
        <field>
          <id>SEQ_NR</id>
          <enabled>Y</enabled>
          <name>SEQ_NR</name>
        </field>
        <field>
          <id>LOGDATE</id>
          <enabled>Y</enabled>
          <name>LOGDATE</name>
        </field>
        <field>
          <id>TRANSNAME</id>
          <enabled>Y</enabled>
          <name>TRANSNAME</name>
        </field>
        <field>
          <id>STEPNAME</id>
          <enabled>Y</enabled>
          <name>STEPNAME</name>
        </field>
        <field>
          <id>STEP_COPY</id>
          <enabled>Y</enabled>
          <name>STEP_COPY</name>
        </field>
        <field>
          <id>LINES_READ</id>
          <enabled>Y</enabled>
          <name>LINES_READ</name>
        </field>
        <field>
          <id>LINES_WRITTEN</id>
          <enabled>Y</enabled>
          <name>LINES_WRITTEN</name>
        </field>
        <field>
          <id>LINES_UPDATED</id>
          <enabled>Y</enabled>
          <name>LINES_UPDATED</name>
        </field>
        <field>
          <id>LINES_INPUT</id>
          <enabled>Y</enabled>
          <name>LINES_INPUT</name>
        </field>
        <field>
          <id>LINES_OUTPUT</id>
          <enabled>Y</enabled>
          <name>LINES_OUTPUT</name>
        </field>
        <field>
          <id>LINES_REJECTED</id>
          <enabled>Y</enabled>
          <name>LINES_REJECTED</name>
        </field>
        <field>
          <id>ERRORS</id>
          <enabled>Y</enabled>
          <name>ERRORS</name>
        </field>
        <field>
          <id>INPUT_BUFFER_ROWS</id>
          <enabled>Y</enabled>
          <name>INPUT_BUFFER_ROWS</name>
        </field>
        <field>
          <id>OUTPUT_BUFFER_ROWS</id>
          <enabled>Y</enabled>
          <name>OUTPUT_BUFFER_ROWS</name>
        </field>
      </perf-log-table>
      <channel-log-table>
        <connection/>
        <schema/>
        <table/>
        <timeout_days/>
        <field>
          <id>ID_BATCH</id>
          <enabled>Y</enabled>
          <name>ID_BATCH</name>
        </field>
        <field>
          <id>CHANNEL_ID</id>
          <enabled>Y</enabled>
          <name>CHANNEL_ID</name>
        </field>
        <field>
          <id>LOG_DATE</id>
          <enabled>Y</enabled>
          <name>LOG_DATE</name>
        </field>
        <field>
          <id>LOGGING_OBJECT_TYPE</id>
          <enabled>Y</enabled>
          <name>LOGGING_OBJECT_TYPE</name>
        </field>
        <field>
          <id>OBJECT_NAME</id>
          <enabled>Y</enabled>
          <name>OBJECT_NAME</name>
        </field>
        <field>
          <id>OBJECT_COPY</id>
          <enabled>Y</enabled>
          <name>OBJECT_COPY</name>
        </field>
        <field>
          <id>REPOSITORY_DIRECTORY</id>
          <enabled>Y</enabled>
          <name>REPOSITORY_DIRECTORY</name>
        </field>
        <field>
          <id>FILENAME</id>
          <enabled>Y</enabled>
          <name>FILENAME</name>
        </field>
        <field>
          <id>OBJECT_ID</id>
          <enabled>Y</enabled>
          <name>OBJECT_ID</name>
        </field>
        <field>
          <id>OBJECT_REVISION</id>
          <enabled>Y</enabled>
          <name>OBJECT_REVISION</name>
        </field>
        <field>
          <id>PARENT_CHANNEL_ID</id>
          <enabled>Y</enabled>
          <name>PARENT_CHANNEL_ID</name>
        </field>
        <field>
          <id>ROOT_CHANNEL_ID</id>
          <enabled>Y</enabled>
          <name>ROOT_CHANNEL_ID</name>
        </field>
      </channel-log-table>
      <step-log-table>
        <connection/>
        <schema/>
        <table/>
        <timeout_days/>
        <field>
          <id>ID_BATCH</id>
          <enabled>Y</enabled>
          <name>ID_BATCH</name>
        </field>
        <field>
          <id>CHANNEL_ID</id>
          <enabled>Y</enabled>
          <name>CHANNEL_ID</name>
        </field>
        <field>
          <id>LOG_DATE</id>
          <enabled>Y</enabled>
          <name>LOG_DATE</name>
        </field>
        <field>
          <id>TRANSNAME</id>
          <enabled>Y</enabled>
          <name>TRANSNAME</name>
        </field>
        <field>
          <id>STEPNAME</id>
          <enabled>Y</enabled>
          <name>STEPNAME</name>
        </field>
        <field>
          <id>STEP_COPY</id>
          <enabled>Y</enabled>
          <name>STEP_COPY</name>
        </field>
        <field>
          <id>LINES_READ</id>
          <enabled>Y</enabled>
          <name>LINES_READ</name>
        </field>
        <field>
          <id>LINES_WRITTEN</id>
          <enabled>Y</enabled>
          <name>LINES_WRITTEN</name>
        </field>
        <field>
          <id>LINES_UPDATED</id>
          <enabled>Y</enabled>
          <name>LINES_UPDATED</name>
        </field>
        <field>
          <id>LINES_INPUT</id>
          <enabled>Y</enabled>
          <name>LINES_INPUT</name>
        </field>
        <field>
          <id>LINES_OUTPUT</id>
          <enabled>Y</enabled>
          <name>LINES_OUTPUT</name>
        </field>
        <field>
          <id>LINES_REJECTED</id>
          <enabled>Y</enabled>
          <name>LINES_REJECTED</name>
        </field>
        <field>
          <id>ERRORS</id>
          <enabled>Y</enabled>
          <name>ERRORS</name>
        </field>
        <field>
          <id>LOG_FIELD</id>
          <enabled>N</enabled>
          <name>LOG_FIELD</name>
        </field>
      </step-log-table>
      <metrics-log-table>
        <connection/>
        <schema/>
        <table/>
        <timeout_days/>
        <field>
          <id>ID_BATCH</id>
          <enabled>Y</enabled>
          <name>ID_BATCH</name>
        </field>
        <field>
          <id>CHANNEL_ID</id>
          <enabled>Y</enabled>
          <name>CHANNEL_ID</name>
        </field>
        <field>
          <id>LOG_DATE</id>
          <enabled>Y</enabled>
          <name>LOG_DATE</name>
        </field>
        <field>
          <id>METRICS_DATE</id>
          <enabled>Y</enabled>
          <name>METRICS_DATE</name>
        </field>
        <field>
          <id>METRICS_CODE</id>
          <enabled>Y</enabled>
          <name>METRICS_CODE</name>
        </field>
        <field>
          <id>METRICS_DESCRIPTION</id>
          <enabled>Y</enabled>
          <name>METRICS_DESCRIPTION</name>
        </field>
        <field>
          <id>METRICS_SUBJECT</id>
          <enabled>Y</enabled>
          <name>METRICS_SUBJECT</name>
        </field>
        <field>
          <id>METRICS_TYPE</id>
          <enabled>Y</enabled>
          <name>METRICS_TYPE</name>
        </field>
        <field>
          <id>METRICS_VALUE</id>
          <enabled>Y</enabled>
          <name>METRICS_VALUE</name>
        </field>
      </metrics-log-table>
    </log>
    <maxdate>
      <connection/>
      <table/>
      <field/>
      <offset>0.0</offset>
      <maxdiff>0.0</maxdiff>
    </maxdate>
    <size_rowset>10000</size_rowset>
    <sleep_time_empty>50</sleep_time_empty>
    <sleep_time_full>50</sleep_time_full>
    <unique_connections>N</unique_connections>
    <feedback_shown>Y</feedback_shown>
    <feedback_size>50000</feedback_size>
    <using_thread_priorities>Y</using_thread_priorities>
    <shared_objects_file/>
    <capture_step_performance>N</capture_step_performance>
    <step_performance_capturing_delay>1000</step_performance_capturing_delay>
    <step_performance_capturing_size_limit>100</step_performance_capturing_size_limit>
    <dependencies>
    </dependencies>
    <partitionschemas>
    </partitionschemas>
    <slaveservers>
    </slaveservers>
    <clusterschemas>
    </clusterschemas>
    <created_user>-</created_user>
    <created_date>2021/11/16 11:20:06.891</created_date>
    <modified_user>-</modified_user>
    <modified_date>2021/11/16 11:20:06.891</modified_date>
    <key_for_session_key>H4sIAAAAAAAAAAMAAAAAAAAAAAA=</key_for_session_key>
    <is_key_private>N</is_key_private>
  </info>
  <notepads>
  </notepads>
  <order>
    <hop>
      <from>Excel输入</from>
      <to>排序记录</to>
      <enabled>Y</enabled>
    </hop>
    <hop>
      <from>排序记录</from>
      <to>去除重复记录</to>
      <enabled>Y</enabled>
    </hop>
    <hop>
      <from>去除重复记录</from>
      <to>Excel输出</to>
      <enabled>Y</enabled>
    </hop>
  </order>
  <step>
    <name>Excel输入</name>
    <type>ExcelInput</type>
    <description/>
    <distribute>Y</distribute>
    <custom_distribution/>
    <copies>1</copies>
    <partitioning>
      <method>none</method>
      <schema_name/>
    </partitioning>
    <header>Y</header>
    <noempty>Y</noempty>
    <stoponempty>N</stoponempty>
    <filefield/>
    <sheetfield/>
    <sheetrownumfield/>
    <rownumfield/>
    <sheetfield/>
    <filefield/>
    <limit>0</limit>
    <encoding/>
    <add_to_result_filenames>Y</add_to_result_filenames>
    <accept_filenames>N</accept_filenames>
    <accept_field/>
    <accept_stepname/>
    <file>
      <name>F:\kette_test\input\去除重复记录.xlsx</name>
      <filemask/>
      <exclude_filemask/>
      <file_required>N</file_required>
      <include_subfolders>N</include_subfolders>
    </file>
    <fields>
      <field>
        <name>id</name>
        <type>Number</type>
        <length>-1</length>
        <precision>-1</precision>
        <trim_type>none</trim_type>
        <repeat>N</repeat>
        <format/>
        <currency/>
        <decimal/>
        <group/>
      </field>
      <field>
        <name>name</name>
        <type>String</type>
        <length>-1</length>
        <precision>-1</precision>
        <trim_type>none</trim_type>
        <repeat>N</repeat>
        <format/>
        <currency/>
        <decimal/>
        <group/>
      </field>
      <field>
        <name>age</name>
        <type>Number</type>
        <length>-1</length>
        <precision>-1</precision>
        <trim_type>none</trim_type>
        <repeat>N</repeat>
        <format/>
        <currency/>
        <decimal/>
        <group/>
      </field>
    </fields>
    <sheets>
    </sheets>
    <strict_types>N</strict_types>
    <error_ignored>N</error_ignored>
    <error_line_skipped>N</error_line_skipped>
    <bad_line_files_destination_directory/>
    <bad_line_files_extension>warning</bad_line_files_extension>
    <error_line_files_destination_directory/>
    <error_line_files_extension>error</error_line_files_extension>
    <line_number_files_destination_directory/>
    <line_number_files_extension>line</line_number_files_extension>
    <shortFileFieldName/>
    <pathFieldName/>
    <hiddenFieldName/>
    <lastModificationTimeFieldName/>
    <uriNameFieldName/>
    <rootUriNameFieldName/>
    <extensionFieldName/>
    <sizeFieldName/>
    <spreadsheet_type>POI</spreadsheet_type>
    <attributes/>
    <cluster_schema/>
    <remotesteps>
      <input>
      </input>
      <output>
      </output>
    </remotesteps>
    <GUI>
      <xloc>336</xloc>
      <yloc>256</yloc>
      <draw>Y</draw>
    </GUI>
  </step>
  <step>
    <name>Excel输出</name>
    <type>ExcelOutput</type>
    <description/>
    <distribute>Y</distribute>
    <custom_distribution/>
    <copies>1</copies>
    <partitioning>
      <method>none</method>
      <schema_name/>
    </partitioning>
    <header>Y</header>
    <footer>N</footer>
    <encoding/>
    <append>N</append>
    <add_to_result_filenames>Y</add_to_result_filenames>
    <file>
      <name>F:\kette_test\output\去除重复记录.xls</name>
      <extention/>
      <do_not_open_newfile_init>N</do_not_open_newfile_init>
      <create_parent_folder>N</create_parent_folder>
      <split>N</split>
      <add_date>N</add_date>
      <add_time>N</add_time>
      <SpecifyFormat>N</SpecifyFormat>
      <date_time_format/>
      <sheetname>Sheet1</sheetname>
      <autosizecolums>N</autosizecolums>
      <nullisblank>N</nullisblank>
      <protect_sheet>N</protect_sheet>
      <password>Encrypted </password>
      <splitevery>0</splitevery>
      <usetempfiles>N</usetempfiles>
      <tempdirectory/>
    </file>
    <template>
      <enabled>N</enabled>
      <append>N</append>
      <filename>template.xls</filename>
    </template>
    <fields>
      <field>
        <name>id</name>
        <type>Number</type>
        <format>0</format>
      </field>
      <field>
        <name>name</name>
        <type>String</type>
        <format/>
      </field>
      <field>
        <name>age</name>
        <type>Number</type>
        <format>0</format>
      </field>
    </fields>
    <custom>
      <header_font_name>arial</header_font_name>
      <header_font_size>10</header_font_size>
      <header_font_bold>N</header_font_bold>
      <header_font_italic>N</header_font_italic>
      <header_font_underline>no</header_font_underline>
      <header_font_orientation>horizontal</header_font_orientation>
      <header_font_color>black</header_font_color>
      <header_background_color>none</header_background_color>
      <header_row_height>255</header_row_height>
      <header_alignment>left</header_alignment>
      <header_image/>
      <row_font_name>arial</row_font_name>
      <row_font_size>10</row_font_size>
      <row_font_color>black</row_font_color>
      <row_background_color>none</row_background_color>
    </custom>
    <attributes/>
    <cluster_schema/>
    <remotesteps>
      <input>
      </input>
      <output>
      </output>
    </remotesteps>
    <GUI>
      <xloc>704</xloc>
      <yloc>272</yloc>
      <draw>Y</draw>
    </GUI>
  </step>
  <step>
    <name>去除重复记录</name>
    <type>Unique</type>
    <description/>
    <distribute>Y</distribute>
    <custom_distribution/>
    <copies>1</copies>
    <partitioning>
      <method>none</method>
      <schema_name/>
    </partitioning>
    <count_rows>N</count_rows>
    <count_field/>
    <reject_duplicate_row>N</reject_duplicate_row>
    <error_description/>
    <fields>
      <field>
        <name>id</name>
        <case_insensitive>N</case_insensitive>
      </field>
      <field>
        <name>name</name>
        <case_insensitive>N</case_insensitive>
      </field>
      <field>
        <name>age</name>
        <case_insensitive>N</case_insensitive>
      </field>
    </fields>
    <attributes/>
    <cluster_schema/>
    <remotesteps>
      <input>
      </input>
      <output>
      </output>
    </remotesteps>
    <GUI>
      <xloc>560</xloc>
      <yloc>384</yloc>
      <draw>Y</draw>
    </GUI>
  </step>
  <step>
    <name>排序记录</name>
    <type>SortRows</type>
    <description/>
    <distribute>Y</distribute>
    <custom_distribution/>
    <copies>1</copies>
    <partitioning>
      <method>none</method>
      <schema_name/>
    </partitioning>
    <directory>%%java.io.tmpdir%%</directory>
    <prefix>out</prefix>
    <sort_size>1000000</sort_size>
    <free_memory/>
    <compress>N</compress>
    <compress_variable/>
    <unique_rows>N</unique_rows>
    <fields>
      <field>
        <name>id</name>
        <ascending>Y</ascending>
        <case_sensitive>N</case_sensitive>
        <collator_enabled>N</collator_enabled>
        <collator_strength>0</collator_strength>
        <presorted>N</presorted>
      </field>
      <field>
        <name>name</name>
        <ascending>Y</ascending>
        <case_sensitive>Y</case_sensitive>
        <collator_enabled>N</collator_enabled>
        <collator_strength>0</collator_strength>
        <presorted>N</presorted>
      </field>
      <field>
        <name>age</name>
        <ascending>Y</ascending>
        <case_sensitive>N</case_sensitive>
        <collator_enabled>N</collator_enabled>
        <collator_strength>0</collator_strength>
        <presorted>N</presorted>
      </field>
    </fields>
    <attributes/>
    <cluster_schema/>
    <remotesteps>
      <input>
      </input>
      <output>
      </output>
    </remotesteps>
    <GUI>
      <xloc>448</xloc>
      <yloc>384</yloc>
      <draw>Y</draw>
    </GUI>
  </step>
  <step_error_handling>
  </step_error_handling>
  <slave-step-copy-partition-distribution>
  </slave-step-copy-partition-distribution>
  <slave_transformation>N</slave_transformation>
  <attributes/>
</transformation>

        至此,关于如何通过ktr文件来编写代码的内容就先写到这里。已经写了好几个简单案例,以后即使忘了,也可以翻看这些案例来回忆。后续再看看能不能整理一下如何继承Kettle的类并重写其中的代码,写一些自己的东西。

posted @ 2021-11-23 14:18  伟衙内  阅读(52)  评论(0编辑  收藏  举报