某天有1千万条查询记录,其中大部分是重复的,去重后可能只有300万条不同的查询,每条查询的长度为1-255字节。请设计算法,找出最热门(出现次数最多)的10条查询。
思路:哈希表 + 最小堆。先用哈希表统计每条查询的出现次数,再用大小为 k 的最小堆从统计结果中选出出现次数最多的 k 条。时间复杂度为 O(n lg k),其中 n 为数据量,k 为需要找出的热门查询条数(这里 k = 10)。
#include <stdio.h>
#include <cstring>
#include <algorithm>
using namespace std;

// Number of buckets in the chained hash table.
#define HASHLEN 2807303
// Longest word accepted, in bytes, excluding the terminating NUL.
#define MAXWORD 255
// Size of a buffer able to hold one word plus its terminator.
#define CHARLEN (MAXWORD + 1)
// Number of most frequent words reported by main().
#define TOPK 10

// Builds a width-limited scanf conversion ("%255s") from MAXWORD so that the
// buffer size and the scan width can never drift apart.
#define STRINGIFY_(x) #x
#define STRINGIFY(x) STRINGIFY_(x)
#define WORD_FMT "%" STRINGIFY(MAXWORD) "s"

typedef struct node_no_space *ptr_no_space;
typedef struct node_has_space *ptr_has_space;

// Bucket heads of the hash table. Being a global, the array is
// zero-initialised, so every chain starts out empty.
ptr_no_space head[HASHLEN];

// One hash-table entry: a heap-allocated word, its occurrence count and the
// link to the next entry in the same bucket.
struct node_no_space
{
    char *word;
    int count;
    node_no_space *next;
};

// One candidate in the top-k heap; the word is stored inline.
struct node_has_space
{
    char word[CHARLEN];
    int count;
};

// Heap ordering: "greater count first" turns the std heap functions into a
// min-heap on count, so the smallest of the current candidates sits at index 0.
bool cmp(const node_has_space &a, const node_has_space &b)
{
    return a.count > b.count;
}

// Polynomial (base 31) hash of a NUL-terminated string.
// Returns a bucket index that is always in [0, HASHLEN).
int hash_funtion(const char *p)
{
    unsigned int value = 0;
    for (; *p != '\0'; ++p)
    {
        // Go through unsigned char so bytes >= 0x80 cannot make the hash
        // negative, and reduce on every step so the result can never be
        // equal to HASHLEN (which would index past the end of head[]).
        value = (value * 31u + static_cast<unsigned char>(*p)) % HASHLEN;
    }
    return static_cast<int>(value);
}

// Counts one occurrence of str: bumps the counter of an existing entry, or
// prepends a new entry (with count 1) to the bucket chain.
void addwordToTable(const char *str)
{
    const int index = hash_funtion(str);

    for (ptr_no_space node = head[index]; node != NULL; node = node->next)
    {
        if (strcmp(node->word, str) == 0)
        {
            node->count++;
            return;
        }
    }

    // Not present in this bucket: create a new record.
    ptr_no_space fresh = new node_no_space;
    fresh->count = 1;
    fresh->word = new char[strlen(str) + 1];
    strcpy(fresh->word, str);
    fresh->next = head[index];
    head[index] = fresh;
}

// True for ASCII letters and digits only; every other byte is a "symbol".
static bool is_word_char(char c)
{
    return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}

// Strips leading and trailing non-alphanumeric characters from str in place.
// n is the index of the last character (strlen(str) - 1). A string made up
// entirely of symbols becomes the empty string.
void handle_symbol(char *str, int n)
{
    if (str == NULL || n < 0)
        return;

    // Trim from the back, never stepping below index 0.
    int last = n;
    while (last >= 0 && !is_word_char(str[last]))
        --last;
    if (last < n)
        str[last + 1] = '\0';

    // Trim from the front with a single move instead of shifting the whole
    // string once per removed character.
    int first = 0;
    while (first <= last && !is_word_char(str[first]))
        ++first;
    if (first > 0)
        memmove(str, str + first, strlen(str + first) + 1);
}

// Dumps every "word count" pair of the hash table to result.txt, one per line.
// Reports the error and returns without writing if the file cannot be opened.
void write_to_file()
{
    FILE *fp = fopen("result.txt", "w");
    if (fp == NULL)
    {
        perror("result.txt");
        return;
    }

    for (int i = 0; i < HASHLEN; i++)
    {
        for (ptr_no_space node = head[i]; node != NULL; node = node->next)
            fprintf(fp, "%s %d\n", node->word, node->count);
    }
    fclose(fp);
}

// Reads whitespace-separated words from string.txt, counts them in the hash
// table, writes all counts to result.txt, then re-reads that file and prints
// the TOPK most frequent words in descending order of count.
// Returns 0 on success and 1 if an input file cannot be opened.
int main()
{
    FILE *fp_read = fopen("string.txt", "r");
    if (fp_read == NULL)
    {
        perror("string.txt");
        return 1;
    }

    // The scan width keeps fscanf inside the buffer; a token longer than
    // MAXWORD bytes is consumed as several consecutive words.
    char str[CHARLEN];
    while (fscanf(fp_read, WORD_FMT, str) == 1)
    {
        handle_symbol(str, static_cast<int>(strlen(str)) - 1);
        if (str[0] != '\0') // skip tokens that consisted only of symbols
            addwordToTable(str);
    }
    fclose(fp_read);

    write_to_file();

    FILE *fp_result = fopen("result.txt", "r");
    if (fp_result == NULL)
    {
        perror("result.txt");
        return 1;
    }

    // Min-heap of at most TOPK candidates. It is grown record by record, so
    // inputs with fewer than TOPK distinct words are handled correctly.
    node_has_space heap[TOPK];
    int heap_size = 0;
    node_has_space entry;
    while (fscanf(fp_result, WORD_FMT " %d", entry.word, &entry.count) == 2)
    {
        if (heap_size < TOPK)
        {
            heap[heap_size++] = entry;
            push_heap(heap, heap + heap_size, cmp);
        }
        else if (entry.count > heap[0].count)
        {
            // Replace the current minimum: move it to the back, overwrite it,
            // and sift the newcomer into place.
            pop_heap(heap, heap + TOPK, cmp);
            heap[TOPK - 1] = entry;
            push_heap(heap, heap + TOPK, cmp);
        }
    }
    fclose(fp_result);

    // With the "greater" comparator sort_heap yields descending counts.
    sort_heap(heap, heap + heap_size, cmp);
    for (int i = 0; i < heap_size; i++)
        printf("%s %d\n", heap[i].word, heap[i].count);

    return 0;
}