决策表快速排序

一、说明。

所谓决策表,类似于关系数据库的二维数据表,形如:
4 3 0
1 0 1
8 1 0
1 2 0
1 2 1
7 3 1
7 4 0

排序后输出:

1 0 1
1 2 0
1 2 1
4 3 0
7 3 1
7 4 0
8 1 0

 

二、问题由来。
决策表约简是粗糙集的一个经典问题。
关于如何解释粗糙集约简问题,我有一个很简单的解释,不过不会在这里写出。
简而言之约简就是在保持原有数据集分类能力的前提下删除冗余属性。
粗糙集的创始者Pawlak有着一个近乎偏执的理念:知识就是分类。

完成分类是进一步完成粗糙集约简的基础。
所以针对如何分类就有了各种各样的解法。

蛮力算法就是两两比较,完成分类,这个复杂度很高。
在这种情况下,先排序再分类是一个进步的方法。
当然排序的方法也很多,基数排序、快速排序都是排序,也的确都有人进行过尝试。

我这里的这个排序方法来自于《计算机学报》上的一篇《属性序下的快速约简算法》。
文章的作者当时发了两篇文章,这篇约简的文章建立在另外一篇《二维表快速排序的复杂度分析》之上。

这里我只是简单实现了原文算法。

三、实现代码,只是想重复这个实验,然后用我的方法与此相比较。

  1 #include <stdlib.h>
  2 #include <string.h>
  3 #include <stdio.h>
  4 #include <math.h>
  5 #include <time.h>
  6 #include <windows.h>
  7 #include "decTable.h"
  8 
  9 const int AttOrderTerminator = -1;
 10 
 11 typedef struct tagConditionClass{
 12     int deputyRowNO; // start index NO. in tblIdx.
 13     int terminalRowNO; // terminal index NO. in tblIdx.
 14     bool available; // if cdncls has >= 2 decision value, it is not available.
 15 }ConditionClass;
 16 
 17 struct tagDecisionTableEX{
 18     DecisionTable * table;
 19     int * tblIdx;
 20     int from;
 21     int to;
 22 };
 23 typedef tagDecisionTableEX DecisionTableEX;
 24 
 25 int partition(DecisionTable * table, int * tblIdx, int stage, int low, int high);
 26 int TDQuicksort(DecisionTable * table, int * tblIdx, int stage, int low, int high);
 27 int loadDecisionTablePositiveRegion(DecisionTable * table, int * tblIdx, bool * tblPositiveRegion);
 28 int partitionMatrix(DecisionTableEX * tex, int * attOrder, int stage, int * nonEmptyLabel, bool * tblPositiveRegion);
 29 int attOrderReduction();
 30 
 31 int partition(DecisionTable * table, int * tblIdx, int stage, int low, int high){
 32     TableElement * s = table->dataCenter;
 33     int ext = table->extCdnAttribCount;
 34     int t;
 35 
 36     int mid = low;
 37     int hiEnd = mid+1;
 38     int counter = 0;
 39     for(int i=low+1; i<=high; i++){
 40         int ref = s[tblIdx[low] * ext + stage];
 41         int element = s[tblIdx[i] * ext + stage];
 42         if (element < ref){
 43             mid++;
 44             hiEnd++;
 45             t = tblIdx[mid];
 46             tblIdx[mid] = tblIdx[i];
 47             tblIdx[i] = t;
 48         }
 49         if(element == ref){
 50             t = tblIdx[i];
 51             tblIdx[i] = tblIdx[hiEnd];
 52             tblIdx[hiEnd] = t;
 53             hiEnd++;
 54             counter++;
 55         }
 56     }
 57 
 58     t = tblIdx[low];
 59     tblIdx[low] = tblIdx[mid];
 60     tblIdx[mid] = t;
 61 
 62     if (mid == low) return mid + counter;
 63 
 64     return mid-1;
 65 }
 66 
 67 int TDQuicksort(DecisionTable * table, int * tblIdx, int stage, int low, int high){
 68     TableElement * s = table->dataCenter;
 69     int ext = table->extCdnAttribCount;
 70     
 71     if (stage > table->cdnAttributeCount) return 0;
 72     if (low >= high) return 0;
 73 
 74     bool NextDemension = false;
 75     for (int i=low+1; i<=high; i++)
 76     if ( s[tblIdx[i] * ext + stage] != s[tblIdx[low] * ext + stage]){
 77         NextDemension = true;
 78         break;
 79     }
 80 
 81     if (NextDemension){
 82         int mid = partition(table, tblIdx, stage, low, high);
 83         TDQuicksort(table, tblIdx, stage, low, mid);
 84         TDQuicksort(table, tblIdx, stage, mid+1, high);
 85     }
 86 
 87     if (!NextDemension){
 88         TDQuicksort(table, tblIdx, stage+1, low, high);
 89     }
 90 
 91     return 0;
 92 }
 93 
 94 int loadDecisionTablePositiveRegion(DecisionTable * table, int * tblIdx, bool * tblPositiveRegion){
 95     int CdnEquClsNO = table->elementCount+2;
 96     int cdnClsPointer = 0;
 97 
 98     int cdn = table->cdnAttributeCount;
 99     int ext = table->extCdnAttribCount;
100     int tfsi = table->elementCount;
101     
102     ConditionClass * cdnCls = NULL;
103     HANDLE heap = NULL;
104 
105     int cc = table->cdnCmp;
106 
107     heap = HeapCreate(HEAP_NO_SERIALIZE|HEAP_GENERATE_EXCEPTIONS, 1024*1024, 0);
108     if (heap != NULL){
109         cdnCls = (ConditionClass * )HeapAlloc(heap, 0, CdnEquClsNO * sizeof(ConditionClass));
110     }
111     MakeSure(cdnCls != NULL);
112     SecureZeroMemory(cdnCls, CdnEquClsNO * sizeof(ConditionClass));
113 
114     int from = 0;
115     while(from < tfsi){
116         int duplicate = 0;
117         int i=from;
118         BigInt * src64 = (BigInt *)(table->dataCenter + tblIdx[from] * ext);
119 
120         cdnCls[cdnClsPointer].deputyRowNO = from; // index NO. in tblIdx
121         cdnCls[cdnClsPointer].available = true;
122 
123         // while (Line[from] == Line[i]) {...}
124         while (true){
125             bool bird = true;
126             if (i == tfsi) break;
127             BigInt * dst64 = (BigInt *)(table->dataCenter + tblIdx[i] * ext);
128             for(int m=0; m<cc; m++)
129             if(src64[m]^dst64[m]){
130                 bird = false;
131                 break;
132             }
133             if (!bird) break;
134 
135             if (cdnCls[cdnClsPointer].available == true)
136             if (table->dcnElement[tblIdx[i]] != table->dcnElement[tblIdx[from]]){
137                 cdnCls[cdnClsPointer].available = false;
138             }
139             
140             i++;
141             duplicate++;
142         }
143 
144         from += duplicate;
145         cdnCls[cdnClsPointer].terminalRowNO = from - 1;
146         cdnClsPointer++;
147     }
148 
149     for (int i=0; i<cdnClsPointer; i++){
150         int start = cdnCls[i].deputyRowNO;
151         int terminal = cdnCls[i].terminalRowNO;
152         if (cdnCls[i].available){
153             for (int m=start; m<=terminal; m++) tblPositiveRegion[tblIdx[m]] = true;
154         }
155         if (!cdnCls[i].available){
156             for (int m=start; m<=terminal; m++) tblPositiveRegion[tblIdx[m]] = false;
157         }
158     }
159 
160     HeapFree(heap, 0, cdnCls);
161     HeapDestroy(heap);
162 
163     return 0;
164 }
165 
166 int partitionMatrix(DecisionTableEX * tex, int * attOrder, int stage, int * nonEmptyLabel, bool * tblPositiveRegion){
167     DecisionTable * table = tex->table;
168     int * tblIdx = tex->tblIdx;
169     int from = tex->from;
170     int to = tex->to;
171     
172     while (attOrder[stage] != AttOrderTerminator){
173         if (from >= to) return 0;
174         
175         bool noPRelement = true;
176         for (int i=from; i<=to; i++)
177         if (tblPositiveRegion[tblIdx[i]]){
178             noPRelement = false;
179             break;
180         }
181         if(noPRelement) return 0;
182 
183         bool cannotDistinguishInStage = true;
184         int ext = table->extCdnAttribCount;
185         TableElement * s = table->dataCenter;        
186         for (int i=from; i<=to; i++)
187         if (s[tblIdx[i] * ext + attOrder[stage]] != s[tblIdx[from] * ext + attOrder[stage]]){
188             cannotDistinguishInStage = false;
189             break;
190         }
191         if (cannotDistinguishInStage) partitionMatrix(tex, attOrder, stage+1, nonEmptyLabel, tblPositiveRegion);
192 
193         if (cannotDistinguishInStage == false){
194             nonEmptyLabel[stage] = 1;
195             int sum = 0;
196             double avg = 0;
197             for (int i=from; i<=to; i++) sum += s[tblIdx[i] * ext + attOrder[stage]];
198             avg = ((double)sum) / (to - from + 1);
199             int mid = from -1;
200             for (int i=from; i<=to; i++){
201                 if (s[tblIdx[i] * ext + attOrder[stage]] <= avg){
202                     mid++;
203                     int t = tblIdx[mid];
204                     tblIdx[mid] = tblIdx[i];
205                     tblIdx[i] = t;
206                 }
207             }
208             //if (mid==from || mid==to) __debugbreak();
209             //int mid = partition(table, tblIdx, attOrder[stage], from, to);
210 
211             tex->from = from;
212             tex->to = mid;
213             partitionMatrix(tex, attOrder, stage, nonEmptyLabel, tblPositiveRegion);
214         
215             tex->from = mid+1;
216             tex->to = to;
217             partitionMatrix(tex, attOrder, stage, nonEmptyLabel, tblPositiveRegion);
218         }
219 
220         //printf("%d stage completed.\n", stage);
221         stage++;
222     }
223 
224     return 0;
225 }
226 
227 int attOrderReduction(){
228     DecisionTable table;
229     time_t timeBegin;
230     time_t timeEnd;
231     char fileName[MAX_STR];
232 
233     beginDecisionTable(&table);
234     
235     printf("\nInput data file name : ");
236     scanf_s("%s", fileName, MAX_STR);
237     strcat_s(fileName, MAX_STR, ".txt");
238     timeBegin = clock();
239     fillTableWithFile(&table, fileName);
240     timeEnd = clock();
241     printf("\n%f(s) consumed in reading from file", (double)(timeEnd-timeBegin)/CLOCKS_PER_SEC);
242     printf("\n");
243 
244     //reduction main
245     timeBegin = clock();
246     int * tblIdx = (int *)malloc(table.elementCount * sizeof(int));
247     bool * tblPositiveRegion = (bool *)malloc(table.elementCount * sizeof(bool));
248     for (int i=0; i<table.elementCount; i++) tblIdx[i]=i;
249     
250     TDQuicksort(&table, tblIdx, 0, 0, table.elementCount-1);
251     //TDQuicksort test code
252     //FILE * fp;
253     //fopen_s(&fp, "r8.txt", "w+");
254     //for (int i=0; i<table.elementCount; i++){
255     //    for (int m=0; m<table.cdnAttributeCount; m++)
256     //        fprintf(fp, "%8d", table.dataCenter[ tblIdx[i]*table.extCdnAttribCount + m ]);
257     //    fprintf(fp, "\n");
258     //}
259     //fclose(fp);
260     
261     return 0;
262 }
263 
264 int main(){
265     attOrderReduction();
266 
267     return 0;
268 }
View Code

四、实验结果摘录
所有实验数据来自UCI数据库。
实验机器: i3 2100 + 4G + win7 32bit
VS 2012 32bit Release Mode

Forest CoverType
581012 条数据,每条数据 54 个条件属性。
0.234s

Poker Hand
1025010 条数据,每条数据 10 个条件属性。
0.359s

 

 

posted @ 2013-09-14 18:18  simcity  阅读(366)  评论(0编辑  收藏  举报