lzhenf

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

某天有1千万条查询,其中大部分是重复的,去重后可能只有300万条不同的查询,每条查询的长度为1-255字节。请设计算法找出最热门的10条查询。

哈希 + 最小堆:时间复杂度为 O(n lg k),其中 n 为数据量,k 为需要找出的热门查询条数(即最小堆的大小),这里为 10。

#include <stdio.h>
#include <cstring>
#include <algorithm>
using namespace std;
// Number of buckets in the chained hash table `head`.
#define HASHLEN 2807303
// Size in bytes of the fixed word buffers, including the terminating NUL.
// The problem statement allows queries of up to 255 bytes, so 256 is the
// smallest buffer that can hold any valid query (the previous value of 30
// could not).
#define CHARLEN 256
typedef struct node_no_space* ptr_no_space;
typedef struct node_has_space * ptr_has_space;
// Bucket heads of the hash table; each entry is the first node of a singly
// linked chain, or NULL when the bucket is empty. As an object with static
// storage duration it starts out zero-initialised.
 ptr_no_space  head[HASHLEN];
 
// One entry of a hash-bucket chain: a distinct word together with the number
// of times it has been passed to addwordToTable.
struct node_no_space
{
	// NUL-terminated copy of the word, allocated with new[] by addwordToTable.
	char* word;
	// Occurrence count; starts at 1 when the node is created.
	int count;
	// Next node in the same bucket, or NULL at the end of the chain.
	node_no_space * next;
};
// Fixed-size (word, count) record used by main for the top-10 heap. Unlike
// node_no_space, the word is stored inline rather than behind a pointer.
struct node_has_space
{
	// NUL-terminated word held in an inline buffer of CHARLEN bytes.
	char word[CHARLEN];
	// Occurrence count read back from the result file.
	int count;
};
// Heap/sort comparator ordering records by descending count.
// Passing it to the <algorithm> heap functions yields a min-heap on `count`
// (the smallest count sits at the front), and sort_heap with it leaves the
// range in descending-count order.
// Arguments are taken by const reference so that each comparison does not
// copy the whole inline word buffer.
bool cmp(const node_has_space &a, const node_has_space &b)
{
	return a.count > b.count;
}
  // Maps a NUL-terminated byte string to a bucket index in [0, HASHLEN).
  //
  // Polynomial hash with multiplier 31, reduced modulo HASHLEN after every
  // byte. Returns 0 for a NULL or empty string.
  //
  // Two out-of-range results of the earlier version are fixed here:
  //  - it reduced only when value > HASHLEN, so a string whose hash landed
  //    exactly on HASHLEN (e.g. "[bD$") returned HASHLEN itself, one past
  //    the last bucket;
  //  - it added plain `char` values, which are negative for bytes >= 0x80
  //    where char is signed, producing negative indices.
  int hash_funtion(char *p)
  {
	  if (!p)
		  return 0;

	  int value = 0;
	  while (*p != '\0')
	  {
		  // value < HASHLEN here, so value * 31 + 255 stays far below INT_MAX.
		  value = (value * 31 + static_cast<unsigned char>(*p)) % HASHLEN;
		  ++p;
	  }
	  return value;
  }
  void addwordToTable(char * str)
  {
		int index = hash_funtion(str);
	    ptr_no_space temp = head[index]; //判断头结点
		while ( temp != NULL ) 
		{
			if ( !strcmp(temp->word,str))
			{
				temp->count ++;
				return ;
			}
			temp = temp->next;
		}
		//不在任意的index里面,新开一条记录
		ptr_no_space new_list = new node_no_space;
		new_list->count =1;
		new_list->word = new char[strlen(str ) +1 ];
		strcpy(new_list->word , str);
		new_list->next = head[index];
		head[index] = new_list;
  }
// True for ASCII digits and letters; every other byte counts as a symbol.
static bool is_ascii_alnum(char c)
{
    return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}

// Strips leading and trailing non-alphanumeric bytes from str, in place.
//
// str : NUL-terminated buffer to trim.
// n   : index of the last character of str (i.e. strlen(str) - 1).
//
// Characters in the middle of the word are left untouched. A string made up
// entirely of symbols becomes the empty string; the earlier version walked
// past the start of the buffer and then looped forever in that case.
// A NULL str or a negative n is a no-op.
void handle_symbol(char *str, int n)
{
    if (str == NULL || n < 0)
        return;

    // Trim from the right, never stepping below index 0.
    int last = n;
    while (last >= 0 && !is_ascii_alnum(str[last]))
    {
        str[last] = '\0';
        last--;
    }
    if (last < 0)
        return; // nothing but symbols: str is now empty

    // str[last] is alphanumeric, so this scan stops at or before `last`.
    int first = 0;
    while (!is_ascii_alnum(str[first]))
        first++;

    if (first > 0)
    {
        // Shift the kept characters to the front with a single move instead
        // of one character-by-character shift per leading symbol.
        const int kept = last - first + 1;
        std::memmove(str, str + first, kept);
        std::memset(str + kept, 0, first);
    }
}
void write_to_file()
{
	FILE *fp = fopen("result.txt","w");
	for ( int i = 0 ; i < HASHLEN; i++)
	{
		ptr_no_space tmp = head[i];
		while (  tmp != NULL )
		{
			
			fprintf(fp,"%s %d\n" ,tmp->word , tmp->count);
			tmp = tmp->next ;
		}
	}
	fclose(fp);
}
int main()
{
	FILE *fp_read  = fopen("string.txt","r");
	
	char str[CHARLEN];
	for ( int i = 0 ; i < HASHLEN ; i++)
		head[i] = NULL;
	while ( fscanf(fp_read,"%s" , &str) != EOF)
	{
		 int n = strlen(str) - 1;  
        if (n > 0)  
            handle_symbol(str, n);  
        addwordToTable(str);//往哈希表中添加str
	}
	fclose(fp_read);
	write_to_file();//写入文件
	ptr_has_space heap = new node_has_space [10];
	FILE *fp_result = fopen("result.txt","r");
	int c;
	for ( int i = 0 ; i < 10 ; i++)
	{
		fscanf(fp_result,"%s %d" ,&str  ,&c);
		heap[i].count = c;
		strcpy(heap[i].word , str);
	}
	//建立最小堆
	make_heap(heap,heap+10,cmp);
	ptr_has_space p = new node_has_space;
	//不断读入result.txt中数据 , 维护最小堆
	while ( fscanf(fp_result,"%s %d" ,&p->word , &p->count) != EOF)
	{
		if ( p->count > heap[0].count)
		{
			heap[0].count = p->count;
			strcpy(heap[0].word , p->word);
			make_heap(heap , heap+10 , cmp);
		}
	}
	fclose(fp_result);
	//输出堆中结果
	sort_heap(heap,heap+10 ,cmp);
	for ( int i = 0 ; i < 10  ; i++)
		printf("%s %d\n", heap[i].word , heap[i].count);
	return 0 ;
}

  

posted on 2012-03-21 21:09  lzhenf  阅读(2021)  评论(0)  编辑  收藏  举报