给记录排序并去除重复记录
目录
一、流程图
如下图,本次重点说明排序记录和去除重复记录这两个节点。
排序记录界面配置如下,一般只需要填写字段列信息即可,其他信息使用默认值就行。
去除重复记录同样只需填写下面用于比较的字段即可。这里会比较id、name、age三个字段的值,如果相邻记录的这三个值都一致,就只保留其中一条;当然也可以只指定id,这样id相同的记录就会被视为重复记录。
二、 代码
如何根据ktr文件来写代码,请参考前面写的博客,这里只贴一下代码,仅供参考。
排序记录
/**
 * Builds the "排序记录" (Sort rows) step and adds it to the transformation.
 *
 * <p>Rows are sorted ascending on {@code id}, {@code name} and {@code age}. The
 * unique-rows step that follows only compares neighbouring rows, so the sort must
 * order rows with the same comparison rules that step uses; otherwise equal rows
 * may not end up next to each other and duplicates slip through.
 *
 * @param transMeta transformation the new step is added to
 * @param registry  plugin registry used to look up the step's plugin id
 * @return the created step, already added to {@code transMeta}
 */
private StepMeta getSortRowsStep(TransMeta transMeta, PluginRegistry registry) {
    SortRowsMeta sortRowsMeta = new SortRowsMeta();
    sortRowsMeta.setDirectory("%%java.io.tmpdir%%"); // <directory>%%java.io.tmpdir%%</directory>
    sortRowsMeta.setPrefix("out");                   // <prefix>out</prefix>
    sortRowsMeta.setSortSize("1000000");             // <sort_size>1000000</sort_size>
    sortRowsMeta.setCompressFiles(false);            // <compress>N</compress>
    // The sort step itself can apparently also pass only unique rows; left off here
    // because de-duplication is done by the dedicated unique-rows step.
    sortRowsMeta.setOnlyPassingUniqueRows(false);    // <unique_rows>N</unique_rows>

    /*
     * One <field> element per sort key in the ktr, e.g.
     *   <field>
     *     <name>id</name>
     *     <ascending>Y</ascending>
     *     <case_sensitive>N</case_sensitive>
     *     <collator_enabled>N</collator_enabled>
     *     <collator_strength>0</collator_strength>
     *     <presorted>N</presorted>
     *   </field>
     * The setters below take parallel arrays, one entry per sort key.
     */
    sortRowsMeta.setFieldName(new String[]{"id", "name", "age"});
    // true = ascending
    sortRowsMeta.setAscending(new boolean[]{true, true, true});
    // "name" must be sorted case-sensitively: the ktr has <case_sensitive>Y</case_sensitive>
    // for it, and the unique-rows step compares case-sensitively (case_insensitive = N).
    // A case-insensitive sort would treat "abc" and "ABC" as ties, so two identical
    // "abc" rows could be separated by an "ABC" row and survive de-duplication.
    sortRowsMeta.setCaseSensitive(new boolean[]{false, true, false});
    sortRowsMeta.setCollatorEnabled(new boolean[]{false, false, false});
    sortRowsMeta.setCollatorStrength(new int[]{0, 0, 0});
    sortRowsMeta.setPreSortedField(new boolean[]{false, false, false});

    String sortRowsPluginId = registry.getPluginId(StepPluginType.class, sortRowsMeta);
    StepMeta sortRowsStep = new StepMeta(sortRowsPluginId, "排序记录", (StepMetaInterface) sortRowsMeta);
    sortRowsStep.setDraw(true);
    sortRowsStep.setLocation(448, 384);
    transMeta.addStep(sortRowsStep);
    return sortRowsStep;
}
去除重复记录
/**
 * Creates the "去除重复记录" (Unique rows) step and attaches it to the transformation.
 *
 * <p>Rows are compared on {@code id}, {@code name} and {@code age}, all
 * case-sensitively; row counting and duplicate rejection are switched off.
 *
 * @param transMeta transformation that receives the new step
 * @param registry  plugin registry used to resolve the plugin id of the step
 * @return the step that was added to {@code transMeta}
 */
private StepMeta getUniqueStep(TransMeta transMeta, PluginRegistry registry) {
    final String[] compareFields = {"id", "name", "age"};
    // A fresh boolean[] is all false: no field is compared case-insensitively.
    final boolean[] ignoreCase = new boolean[compareFields.length];

    UniqueRowsMeta meta = new UniqueRowsMeta();
    meta.setCountRows(false);          // <count_rows>N</count_rows>
    meta.setRejectDuplicateRow(false); // <reject_duplicate_row>N</reject_duplicate_row>
    meta.setCompareFields(compareFields);
    meta.setCaseInsensitive(ignoreCase);

    StepMetaInterface stepInterface = meta;
    StepMeta step = new StepMeta(
            registry.getPluginId(StepPluginType.class, meta), "去除重复记录", stepInterface);
    step.setLocation(560, 384);
    step.setDraw(true);
    transMeta.addStep(step);
    return step;
}
三、运行
运行后结果如下图所示:记录已经排好序,并去除了重复记录,而且一次性通过,没有任何错误项。
完整代码
/**
 * Test setup hook: initializes the Kettle environment.
 *
 * <p>An initialization failure is logged and rethrown unchecked instead of being
 * swallowed, so tests do not go on to run against an uninitialized environment
 * and fail later with a misleading error.
 *
 * @throws IllegalStateException if Kettle initialization fails
 */
@Before
public void before() {
    try {
        // Initialize the environment
        EnvUtil.environmentInit();
        KettleEnvironment.init();
    } catch (KettleException e) {
        log.error("Kettle environment initialization failed", e);
        throw new IllegalStateException("Kettle environment initialization failed", e);
    }
}
/**
 * Runs Excel input -> sort rows -> unique rows -> Excel output.
 *
 * <p>Rows must be sorted before duplicates are removed. "Sort rows" followed by
 * "Unique rows" compares each row with its neighbour after sorting and drops
 * equal ones; the alternative "Unique rows (HashSet)" step computes a hash per
 * row and drops rows whose hash was already seen. The original author expects
 * the hash-based variant to be faster in theory (not measured here).
 *
 * @throws KettleException if the transformation cannot be prepared or started,
 *                         or if it finishes with one or more errors
 */
@Test
public void exchangeWithSortAndUnique() throws KettleException {
    TransMeta transMeta = new TransMeta();
    transMeta.setName("去除重复记录");
    PluginRegistry registry = PluginRegistry.getInstance();

    StepMeta inputStep = getInputStep(transMeta, registry);
    StepMeta sortStep = getSortRowsStep(transMeta, registry);
    StepMeta uniqueStep = getUniqueStep(transMeta, registry);
    StepMeta outStep = getOutputStep(transMeta, registry);

    // 4. Connect the steps with hops: input -> sort -> unique -> output
    transMeta.addTransHop(new TransHopMeta(inputStep, sortStep));
    transMeta.addTransHop(new TransHopMeta(sortStep, uniqueStep));
    transMeta.addTransHop(new TransHopMeta(uniqueStep, outStep));

    // 5. Execute the transformation and wait for it to finish
    Trans trans = new Trans(transMeta);
    trans.execute(null);
    trans.waitUntilFinished();

    // Fail the test on errors; printing and returning would let it pass silently.
    if (trans.getErrors() > 0) {
        throw new KettleException("交换出错. errors=" + trans.getErrors());
    }
}
/**
 * Creates the "去除重复记录" (Unique rows) step and attaches it to the transformation.
 *
 * <p>Rows are compared on {@code id}, {@code name} and {@code age}, all
 * case-sensitively; row counting and duplicate rejection are switched off.
 *
 * @param transMeta transformation that receives the new step
 * @param registry  plugin registry used to resolve the plugin id of the step
 * @return the step that was added to {@code transMeta}
 */
private StepMeta getUniqueStep(TransMeta transMeta, PluginRegistry registry) {
    final String[] compareFields = {"id", "name", "age"};
    // A fresh boolean[] is all false: no field is compared case-insensitively.
    final boolean[] ignoreCase = new boolean[compareFields.length];

    UniqueRowsMeta meta = new UniqueRowsMeta();
    meta.setCountRows(false);          // <count_rows>N</count_rows>
    meta.setRejectDuplicateRow(false); // <reject_duplicate_row>N</reject_duplicate_row>
    meta.setCompareFields(compareFields);
    meta.setCaseInsensitive(ignoreCase);

    StepMetaInterface stepInterface = meta;
    StepMeta step = new StepMeta(
            registry.getPluginId(StepPluginType.class, meta), "去除重复记录", stepInterface);
    step.setLocation(560, 384);
    step.setDraw(true);
    transMeta.addStep(step);
    return step;
}
/**
 * Builds the "排序记录" (Sort rows) step and adds it to the transformation.
 *
 * <p>Rows are sorted ascending on {@code id}, {@code name} and {@code age}. The
 * unique-rows step that follows only compares neighbouring rows, so the sort must
 * order rows with the same comparison rules that step uses; otherwise equal rows
 * may not end up next to each other and duplicates slip through.
 *
 * @param transMeta transformation the new step is added to
 * @param registry  plugin registry used to look up the step's plugin id
 * @return the created step, already added to {@code transMeta}
 */
private StepMeta getSortRowsStep(TransMeta transMeta, PluginRegistry registry) {
    SortRowsMeta sortRowsMeta = new SortRowsMeta();
    sortRowsMeta.setDirectory("%%java.io.tmpdir%%"); // <directory>%%java.io.tmpdir%%</directory>
    sortRowsMeta.setPrefix("out");                   // <prefix>out</prefix>
    sortRowsMeta.setSortSize("1000000");             // <sort_size>1000000</sort_size>
    sortRowsMeta.setCompressFiles(false);            // <compress>N</compress>
    // The sort step itself can apparently also pass only unique rows; left off here
    // because de-duplication is done by the dedicated unique-rows step.
    sortRowsMeta.setOnlyPassingUniqueRows(false);    // <unique_rows>N</unique_rows>

    /*
     * One <field> element per sort key in the ktr, e.g.
     *   <field>
     *     <name>id</name>
     *     <ascending>Y</ascending>
     *     <case_sensitive>N</case_sensitive>
     *     <collator_enabled>N</collator_enabled>
     *     <collator_strength>0</collator_strength>
     *     <presorted>N</presorted>
     *   </field>
     * The setters below take parallel arrays, one entry per sort key.
     */
    sortRowsMeta.setFieldName(new String[]{"id", "name", "age"});
    // true = ascending
    sortRowsMeta.setAscending(new boolean[]{true, true, true});
    // "name" must be sorted case-sensitively: the ktr has <case_sensitive>Y</case_sensitive>
    // for it, and the unique-rows step compares case-sensitively (case_insensitive = N).
    // A case-insensitive sort would treat "abc" and "ABC" as ties, so two identical
    // "abc" rows could be separated by an "ABC" row and survive de-duplication.
    sortRowsMeta.setCaseSensitive(new boolean[]{false, true, false});
    sortRowsMeta.setCollatorEnabled(new boolean[]{false, false, false});
    sortRowsMeta.setCollatorStrength(new int[]{0, 0, 0});
    sortRowsMeta.setPreSortedField(new boolean[]{false, false, false});

    String sortRowsPluginId = registry.getPluginId(StepPluginType.class, sortRowsMeta);
    StepMeta sortRowsStep = new StepMeta(sortRowsPluginId, "排序记录", (StepMetaInterface) sortRowsMeta);
    sortRowsStep.setDraw(true);
    sortRowsStep.setLocation(448, 384);
    transMeta.addStep(sortRowsStep);
    return sortRowsStep;
}
/**
 * Creates the "Excel输入" (Excel input) step and adds it to the transformation.
 *
 * <p>The step reads one hard-coded .xlsx file with the POI spreadsheet type and
 * declares three fields: {@code id} (Number), {@code name} (String) and
 * {@code age} (Number).
 *
 * @param transMeta transformation that receives the new step
 * @param registry  plugin registry used to resolve the plugin id of the step
 * @return the step that was added to {@code transMeta}
 */
private StepMeta getInputStep(TransMeta transMeta, PluginRegistry registry) {
    ExcelInputMeta excelMeta = new ExcelInputMeta(); // <type>ExcelInput</type>

    // <file> section: a single input file, no masks, not required, no sub-folders
    excelMeta.setFileName(new String[]{"F:\\kette_test\\input\\去除重复记录.xlsx"}); // <name>...</name>
    excelMeta.setFileMask(new String[1]);              // <filemask/>
    excelMeta.setExcludeFileMask(new String[1]);       // <exclude_filemask/>
    excelMeta.setFileRequired(new String[]{"N"});      // <file_required>N</file_required>
    excelMeta.setIncludeSubFolders(new String[]{"N"}); // <include_subfolders>N</include_subfolders>
    excelMeta.setSpreadSheetType(SpreadSheetType.POI); // <spreadsheet_type>POI</spreadsheet_type>

    // Per the original author: Kettle 8.x needs the header flag set explicitly when
    // the sheet has a header row, while 5.x does not.
    excelMeta.setStartsWithHeader(true);
    // Start row index 1 (the author's note: start at the second row) ...
    excelMeta.setStartRow(new int[]{1});
    // ... and start column index 0 (the first column).
    excelMeta.setStartColumn(new int[]{0});

    // <fields> ... </fields> of the Excel input step
    String[] columnNames = {"id", "name", "age"};
    int[] columnTypes = {
            ValueMetaInterface.TYPE_NUMBER,
            ValueMetaInterface.TYPE_STRING,
            ValueMetaInterface.TYPE_NUMBER
    };
    ExcelInputField[] columns = new ExcelInputField[columnNames.length];
    for (int idx = 0; idx < columns.length; idx++) {
        ExcelInputField column = new ExcelInputField();
        column.setName(columnNames[idx]);
        column.setType(columnTypes[idx]);
        column.setTrimType(ExcelInputMeta.TYPE_TRIM_NONE);
        column.setRepeated(false);
        columns[idx] = column;
    }
    excelMeta.setField(columns);

    // 2. Wrap the meta in a step and add it to the transformation
    StepMetaInterface stepInterface = excelMeta;
    StepMeta step = new StepMeta(
            registry.getPluginId(StepPluginType.class, excelMeta),
            "Excel输入", // <step> --> <name>Excel输入</name>
            stepInterface);
    step.setLocation(336, 256);
    step.setDraw(true);
    transMeta.addStep(step);
    return step;
}
/**
 * Creates the "Excel输出" (Excel output) step and adds it to the transformation.
 *
 * <p>The step writes {@code id}, {@code name} and {@code age} to a hard-coded
 * .xls file with a header row and no footer; the two numeric fields use the
 * format {@code "0"}.
 *
 * @param transMeta transformation that receives the new step
 * @param registry  plugin registry used to resolve the plugin id of the step
 * @return the step that was added to {@code transMeta}
 */
private StepMeta getOutputStep(TransMeta transMeta, PluginRegistry registry) {
    ExcelOutputMeta excelMeta = new ExcelOutputMeta(); // <type>ExcelOutput</type>
    excelMeta.setAppend(false);        // <append>N</append>
    excelMeta.setHeaderEnabled(true);  // <header>Y</header>
    excelMeta.setFooterEnabled(false); // <footer>N</footer>
    // The file name differs from the one in the ktr so that this output can be
    // told apart from the file produced when the ktr is run in Spoon.
    excelMeta.setFileName("F:\\kette_test\\output\\去除重复记录2.xls");
    excelMeta.setDoNotOpenNewFileInit(false); // <do_not_open_newfile_init>N</do_not_open_newfile_init>
    excelMeta.setCreateParentFolder(false);   // <create_parent_folder>N</create_parent_folder>

    // <fields> ... </fields>
    String[] columnNames = {"id", "name", "age"};
    int[] columnTypes = {
            ValueMetaInterface.TYPE_NUMBER,
            ValueMetaInterface.TYPE_STRING,
            ValueMetaInterface.TYPE_NUMBER
    };
    ExcelField[] columns = new ExcelField[columnNames.length];
    for (int idx = 0; idx < columns.length; idx++) {
        String columnName = columnNames[idx];
        ExcelField column = new ExcelField();
        column.setName(columnName);
        column.setType(columnTypes[idx]);
        // id and age carry <format>0</format> in the ktr; name has no format.
        if ("id".equals(columnName) || "age".equals(columnName)) {
            column.setFormat("0");
        }
        columns[idx] = column;
    }
    excelMeta.setOutputFields(columns);

    StepMetaInterface stepInterface = excelMeta;
    StepMeta step = new StepMeta(
            registry.getPluginId(StepPluginType.class, excelMeta),
            "Excel输出", // <step> --> <name>Excel输出</name>
            stepInterface);
    step.setLocation(704, 272);
    step.setDraw(true);
    transMeta.addStep(step);
    return step;
}
ktr文件
<?xml version="1.0" encoding="UTF-8"?>
<transformation>
<info>
<name>去除重复记录</name>
<description/>
<extended_description/>
<trans_version/>
<trans_type>Normal</trans_type>
<directory>/</directory>
<parameters>
</parameters>
<log>
<trans-log-table>
<connection/>
<schema/>
<table/>
<size_limit_lines/>
<interval/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STATUS</id>
<enabled>Y</enabled>
<name>STATUS</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
<subject/>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
<subject/>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
<subject/>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
<subject/>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
<subject/>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
<subject/>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>STARTDATE</id>
<enabled>Y</enabled>
<name>STARTDATE</name>
</field>
<field>
<id>ENDDATE</id>
<enabled>Y</enabled>
<name>ENDDATE</name>
</field>
<field>
<id>LOGDATE</id>
<enabled>Y</enabled>
<name>LOGDATE</name>
</field>
<field>
<id>DEPDATE</id>
<enabled>Y</enabled>
<name>DEPDATE</name>
</field>
<field>
<id>REPLAYDATE</id>
<enabled>Y</enabled>
<name>REPLAYDATE</name>
</field>
<field>
<id>LOG_FIELD</id>
<enabled>Y</enabled>
<name>LOG_FIELD</name>
</field>
<field>
<id>EXECUTING_SERVER</id>
<enabled>N</enabled>
<name>EXECUTING_SERVER</name>
</field>
<field>
<id>EXECUTING_USER</id>
<enabled>N</enabled>
<name>EXECUTING_USER</name>
</field>
<field>
<id>CLIENT</id>
<enabled>N</enabled>
<name>CLIENT</name>
</field>
</trans-log-table>
<perf-log-table>
<connection/>
<schema/>
<table/>
<interval/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>SEQ_NR</id>
<enabled>Y</enabled>
<name>SEQ_NR</name>
</field>
<field>
<id>LOGDATE</id>
<enabled>Y</enabled>
<name>LOGDATE</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STEPNAME</id>
<enabled>Y</enabled>
<name>STEPNAME</name>
</field>
<field>
<id>STEP_COPY</id>
<enabled>Y</enabled>
<name>STEP_COPY</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>INPUT_BUFFER_ROWS</id>
<enabled>Y</enabled>
<name>INPUT_BUFFER_ROWS</name>
</field>
<field>
<id>OUTPUT_BUFFER_ROWS</id>
<enabled>Y</enabled>
<name>OUTPUT_BUFFER_ROWS</name>
</field>
</perf-log-table>
<channel-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>LOGGING_OBJECT_TYPE</id>
<enabled>Y</enabled>
<name>LOGGING_OBJECT_TYPE</name>
</field>
<field>
<id>OBJECT_NAME</id>
<enabled>Y</enabled>
<name>OBJECT_NAME</name>
</field>
<field>
<id>OBJECT_COPY</id>
<enabled>Y</enabled>
<name>OBJECT_COPY</name>
</field>
<field>
<id>REPOSITORY_DIRECTORY</id>
<enabled>Y</enabled>
<name>REPOSITORY_DIRECTORY</name>
</field>
<field>
<id>FILENAME</id>
<enabled>Y</enabled>
<name>FILENAME</name>
</field>
<field>
<id>OBJECT_ID</id>
<enabled>Y</enabled>
<name>OBJECT_ID</name>
</field>
<field>
<id>OBJECT_REVISION</id>
<enabled>Y</enabled>
<name>OBJECT_REVISION</name>
</field>
<field>
<id>PARENT_CHANNEL_ID</id>
<enabled>Y</enabled>
<name>PARENT_CHANNEL_ID</name>
</field>
<field>
<id>ROOT_CHANNEL_ID</id>
<enabled>Y</enabled>
<name>ROOT_CHANNEL_ID</name>
</field>
</channel-log-table>
<step-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STEPNAME</id>
<enabled>Y</enabled>
<name>STEPNAME</name>
</field>
<field>
<id>STEP_COPY</id>
<enabled>Y</enabled>
<name>STEP_COPY</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>LOG_FIELD</id>
<enabled>N</enabled>
<name>LOG_FIELD</name>
</field>
</step-log-table>
<metrics-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>METRICS_DATE</id>
<enabled>Y</enabled>
<name>METRICS_DATE</name>
</field>
<field>
<id>METRICS_CODE</id>
<enabled>Y</enabled>
<name>METRICS_CODE</name>
</field>
<field>
<id>METRICS_DESCRIPTION</id>
<enabled>Y</enabled>
<name>METRICS_DESCRIPTION</name>
</field>
<field>
<id>METRICS_SUBJECT</id>
<enabled>Y</enabled>
<name>METRICS_SUBJECT</name>
</field>
<field>
<id>METRICS_TYPE</id>
<enabled>Y</enabled>
<name>METRICS_TYPE</name>
</field>
<field>
<id>METRICS_VALUE</id>
<enabled>Y</enabled>
<name>METRICS_VALUE</name>
</field>
</metrics-log-table>
</log>
<maxdate>
<connection/>
<table/>
<field/>
<offset>0.0</offset>
<maxdiff>0.0</maxdiff>
</maxdate>
<size_rowset>10000</size_rowset>
<sleep_time_empty>50</sleep_time_empty>
<sleep_time_full>50</sleep_time_full>
<unique_connections>N</unique_connections>
<feedback_shown>Y</feedback_shown>
<feedback_size>50000</feedback_size>
<using_thread_priorities>Y</using_thread_priorities>
<shared_objects_file/>
<capture_step_performance>N</capture_step_performance>
<step_performance_capturing_delay>1000</step_performance_capturing_delay>
<step_performance_capturing_size_limit>100</step_performance_capturing_size_limit>
<dependencies>
</dependencies>
<partitionschemas>
</partitionschemas>
<slaveservers>
</slaveservers>
<clusterschemas>
</clusterschemas>
<created_user>-</created_user>
<created_date>2021/11/16 11:20:06.891</created_date>
<modified_user>-</modified_user>
<modified_date>2021/11/16 11:20:06.891</modified_date>
<key_for_session_key>H4sIAAAAAAAAAAMAAAAAAAAAAAA=</key_for_session_key>
<is_key_private>N</is_key_private>
</info>
<notepads>
</notepads>
<order>
<hop>
<from>Excel输入</from>
<to>排序记录</to>
<enabled>Y</enabled>
</hop>
<hop>
<from>排序记录</from>
<to>去除重复记录</to>
<enabled>Y</enabled>
</hop>
<hop>
<from>去除重复记录</from>
<to>Excel输出</to>
<enabled>Y</enabled>
</hop>
</order>
<step>
<name>Excel输入</name>
<type>ExcelInput</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<header>Y</header>
<noempty>Y</noempty>
<stoponempty>N</stoponempty>
<filefield/>
<sheetfield/>
<sheetrownumfield/>
<rownumfield/>
<sheetfield/>
<filefield/>
<limit>0</limit>
<encoding/>
<add_to_result_filenames>Y</add_to_result_filenames>
<accept_filenames>N</accept_filenames>
<accept_field/>
<accept_stepname/>
<file>
<name>F:\kette_test\input\去除重复记录.xlsx</name>
<filemask/>
<exclude_filemask/>
<file_required>N</file_required>
<include_subfolders>N</include_subfolders>
</file>
<fields>
<field>
<name>id</name>
<type>Number</type>
<length>-1</length>
<precision>-1</precision>
<trim_type>none</trim_type>
<repeat>N</repeat>
<format/>
<currency/>
<decimal/>
<group/>
</field>
<field>
<name>name</name>
<type>String</type>
<length>-1</length>
<precision>-1</precision>
<trim_type>none</trim_type>
<repeat>N</repeat>
<format/>
<currency/>
<decimal/>
<group/>
</field>
<field>
<name>age</name>
<type>Number</type>
<length>-1</length>
<precision>-1</precision>
<trim_type>none</trim_type>
<repeat>N</repeat>
<format/>
<currency/>
<decimal/>
<group/>
</field>
</fields>
<sheets>
</sheets>
<strict_types>N</strict_types>
<error_ignored>N</error_ignored>
<error_line_skipped>N</error_line_skipped>
<bad_line_files_destination_directory/>
<bad_line_files_extension>warning</bad_line_files_extension>
<error_line_files_destination_directory/>
<error_line_files_extension>error</error_line_files_extension>
<line_number_files_destination_directory/>
<line_number_files_extension>line</line_number_files_extension>
<shortFileFieldName/>
<pathFieldName/>
<hiddenFieldName/>
<lastModificationTimeFieldName/>
<uriNameFieldName/>
<rootUriNameFieldName/>
<extensionFieldName/>
<sizeFieldName/>
<spreadsheet_type>POI</spreadsheet_type>
<attributes/>
<cluster_schema/>
<remotesteps>
<input>
</input>
<output>
</output>
</remotesteps>
<GUI>
<xloc>336</xloc>
<yloc>256</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<name>Excel输出</name>
<type>ExcelOutput</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<header>Y</header>
<footer>N</footer>
<encoding/>
<append>N</append>
<add_to_result_filenames>Y</add_to_result_filenames>
<file>
<name>F:\kette_test\output\去除重复记录.xls</name>
<extention/>
<do_not_open_newfile_init>N</do_not_open_newfile_init>
<create_parent_folder>N</create_parent_folder>
<split>N</split>
<add_date>N</add_date>
<add_time>N</add_time>
<SpecifyFormat>N</SpecifyFormat>
<date_time_format/>
<sheetname>Sheet1</sheetname>
<autosizecolums>N</autosizecolums>
<nullisblank>N</nullisblank>
<protect_sheet>N</protect_sheet>
<password>Encrypted </password>
<splitevery>0</splitevery>
<usetempfiles>N</usetempfiles>
<tempdirectory/>
</file>
<template>
<enabled>N</enabled>
<append>N</append>
<filename>template.xls</filename>
</template>
<fields>
<field>
<name>id</name>
<type>Number</type>
<format>0</format>
</field>
<field>
<name>name</name>
<type>String</type>
<format/>
</field>
<field>
<name>age</name>
<type>Number</type>
<format>0</format>
</field>
</fields>
<custom>
<header_font_name>arial</header_font_name>
<header_font_size>10</header_font_size>
<header_font_bold>N</header_font_bold>
<header_font_italic>N</header_font_italic>
<header_font_underline>no</header_font_underline>
<header_font_orientation>horizontal</header_font_orientation>
<header_font_color>black</header_font_color>
<header_background_color>none</header_background_color>
<header_row_height>255</header_row_height>
<header_alignment>left</header_alignment>
<header_image/>
<row_font_name>arial</row_font_name>
<row_font_size>10</row_font_size>
<row_font_color>black</row_font_color>
<row_background_color>none</row_background_color>
</custom>
<attributes/>
<cluster_schema/>
<remotesteps>
<input>
</input>
<output>
</output>
</remotesteps>
<GUI>
<xloc>704</xloc>
<yloc>272</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<name>去除重复记录</name>
<type>Unique</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<count_rows>N</count_rows>
<count_field/>
<reject_duplicate_row>N</reject_duplicate_row>
<error_description/>
<fields>
<field>
<name>id</name>
<case_insensitive>N</case_insensitive>
</field>
<field>
<name>name</name>
<case_insensitive>N</case_insensitive>
</field>
<field>
<name>age</name>
<case_insensitive>N</case_insensitive>
</field>
</fields>
<attributes/>
<cluster_schema/>
<remotesteps>
<input>
</input>
<output>
</output>
</remotesteps>
<GUI>
<xloc>560</xloc>
<yloc>384</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<name>排序记录</name>
<type>SortRows</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<directory>%%java.io.tmpdir%%</directory>
<prefix>out</prefix>
<sort_size>1000000</sort_size>
<free_memory/>
<compress>N</compress>
<compress_variable/>
<unique_rows>N</unique_rows>
<fields>
<field>
<name>id</name>
<ascending>Y</ascending>
<case_sensitive>N</case_sensitive>
<collator_enabled>N</collator_enabled>
<collator_strength>0</collator_strength>
<presorted>N</presorted>
</field>
<field>
<name>name</name>
<ascending>Y</ascending>
<case_sensitive>Y</case_sensitive>
<collator_enabled>N</collator_enabled>
<collator_strength>0</collator_strength>
<presorted>N</presorted>
</field>
<field>
<name>age</name>
<ascending>Y</ascending>
<case_sensitive>N</case_sensitive>
<collator_enabled>N</collator_enabled>
<collator_strength>0</collator_strength>
<presorted>N</presorted>
</field>
</fields>
<attributes/>
<cluster_schema/>
<remotesteps>
<input>
</input>
<output>
</output>
</remotesteps>
<GUI>
<xloc>448</xloc>
<yloc>384</yloc>
<draw>Y</draw>
</GUI>
</step>
<step_error_handling>
</step_error_handling>
<slave-step-copy-partition-distribution>
</slave-step-copy-partition-distribution>
<slave_transformation>N</slave_transformation>
<attributes/>
</transformation>
至此,关于如何通过ktr文件来写代码就先到这里。已经写了好几个简单案例,以后忘了也可以看看这些案例来回忆。后续看看能不能整理一下如何继承Kettle的类并重写其中的代码,写一些自己的东西。