MySQL->处理重复数据[20180517]
限制数据重复的方式:表上增加主键(Primary Key)或增加唯一性索引(Unique)
主键对重复资料进行限制,这样资料在导入时就无法重复插入
create table primary_t( last_name varchar(20), first_name varchar(20), sex varchar(8), primary key(last_name,first_name) )engine=innodb default charset=utf8; show table status like 'primary_t'\G show index from primary_t\G 模拟插入两笔有主键约束的资料 insert into primary_t(last_name,first_name,sex) values('Frank','Li','Man'),('Frank','Li','Man'),('Coco','Li','Woman'); ERROR 1062 (23000): Duplicate entry 'Frank-Li' for key 'PRIMARY' insert into primary_t(last_name,first_name,sex) values('Frank','Li','Man'),('Frank','Cai','Man'),('Coco','Li','Woman'); mysql> select * from primary_t; +-----------+------------+-------+ | last_name | first_name | sex | +-----------+------------+-------+ | Coco | Li | Woman | | Frank | Cai | Man | | Frank | Li | Man | +-----------+------------+-------+ 3 rows in set (0.00 sec)
唯一性索引对复制资料进行限制
create table unique_t( last_name varchar(20), first_name varchar(20), sex varchar(8), unique index unique_name(last_name,first_name) )engine=innodb default charset=utf8; show table status like 'unique_t'\G show index from unique_t\G 模拟插入两笔有唯一性约束的资料 insert into unique_t(last_name,first_name,sex) values('Frank','Li','Man'),('Frank','Li','Man'),('Coco','Li','Woman'); ERROR 1062 (23000): Duplicate entry 'Frank-Li' for key 'unique_name' insert into unique_t(last_name,first_name,sex) values('Frank','Li','Man'),('Frank','Cai','Man'),('Coco','Li','Woman'); mysql> select * from unique_t; +-----------+------------+-------+ | last_name | first_name | sex | +-----------+------------+-------+ | Frank | Li | Man | | Frank | Cai | Man | | Coco | Li | Woman | +-----------+------------+-------+ 3 rows in set (0.00 sec)
对资料的统计
create table count_t( last_name varchar(20), first_name varchar(20), sex varchar(8) )engine=innodb default charset=utf8; insert into count_t values('Frank','A','Man'), ('Frank','B','Man'), ('Frank','C','Woman'), ('Frank','D','Man'), ('Frank','E','Man'), ('Frank','F','Woman'), ('Frank','G','Man'), ('Frank','H','Man'), ('Frank','I','Woman'), ('Coco','A','Woman'), ('Coco','B','Man'), ('Coco','C','Man'), ('Coco','D','Man'), ('Coco','E','Man'), ('Coco','F','Man'), ('Coco','G','Woman'), ('Coco','H','Woman'), ('Alex','B','Man'), ('ALex','C','Man'), ('ALex','D','Man'), ('Alex','E','Man'), ('ALex','F','Man'), ('ALex','G','Woman'), ('ALex','H','Woman') ; commit; 按各个列进行分组统计 select last_name,count(*) from count_t group by last_name; select first_name,count(*) from count_t group by first_name; select sex,count(*) from count_t group by sex; mysql> select last_name,count(*) from count_t group by last_name; +-----------+----------+ | last_name | count(*) | +-----------+----------+ | Alex | 7 | | Coco | 8 | | Frank | 9 | +-----------+----------+ 3 rows in set (0.00 sec) mysql> select first_name,count(*) from count_t group by first_name; +------------+----------+ | first_name | count(*) | +------------+----------+ | A | 2 | | B | 3 | | C | 3 | | D | 3 | | E | 3 | | F | 3 | | G | 3 | | H | 3 | | I | 1 | +------------+----------+ 9 rows in set (0.00 sec) mysql> select sex,count(*) from count_t group by sex; +-------+----------+ | sex | count(*) | +-------+----------+ | Man | 16 | | Woman | 8 | +-------+----------+ 2 rows in set (0.00 sec) 对分组的统计在进行限定,having 大于等于8次的记录 select last_name ,count(*) from count_t group by last_name having count(*) >=8; mysql> select last_name ,count(*) from count_t group by last_name having count(*) >=8; +-----------+----------+ | last_name | count(*) | +-----------+----------+ | Coco | 8 | | Frank | 9 | +-----------+----------+ 2 rows in set (0.00 sec)
对资料进行去重操作,主要使用distinct函数
select distinct last_name from count_t; select distinct first_name from count_t; select distinct sex from count_t;
对已有资料的表进行去重操作
insert into count_t values('Xi','Xi','Man'),('Xi','Xi','Man'); 增加主键约束,加入ignore,忽悠重复的资料 alter ignore table count_t add primary key(last_name,first_name); select * from count_t where last_name='Xi'; mysql> alter ignore table count_t -> add primary key(last_name,first_name); Query OK, 26 rows affected (0.01 sec) Records: 26 Duplicates: 1 Warnings: 0 mysql> select * from count_t where last_name='Xi'; +-----------+------------+------+ | last_name | first_name | sex | +-----------+------------+------+ | Xi | Xi | Man | +-----------+------------+------+ 1 row in set (0.00 sec)
使用group by方式也可去除重复资料
create table count_t0 as select last_name,first_name,sex from count_t group by last_name,first_name,sex;