瞌睡中的葡萄虎

博客园 首页 新随笔 联系 订阅 管理

测试数据位于:/home/hadoop/luogankun/workspace/sync_data/pig

dept和emp表来源自oracle数据库自带的表

dept.txt

10      ACCOUNTING      NEW YORK
20      RESEARCH        DALLAS
30      SALES   CHICAGO
40      OPERATIONS      BOSTON

emp.txt

7369    SMITH   CLERK   7902    1980-12-17      800.00          20
7499    ALLEN   SALESMAN        7698    1981-2-20       1600.00 300.00  30
7521    WARD    SALESMAN        7698    1981-2-22       1250.00 500.00  30
7566    JONES   MANAGER 7839    1981-4-2        2975.00         20
7654    MARTIN  SALESMAN        7698    1981-9-28       1250.00 1400.00 30
7698    BLAKE   MANAGER 7839    1981-5-1        2850.00         30
7782    CLARK   MANAGER 7839    1981-6-9        2450.00         10
7788    SCOTT   ANALYST 7566    1987-4-19       3000.00         20
7839    KING    PRESIDENT               1981-11-17      5000.00         10
7844    TURNER  SALESMAN        7698    1981-9-8        1500.00 0.00    30
7876    ADAMS   CLERK   7788    1987-5-23       1100.00         20
7900    JAMES   CLERK   7698    1981-12-3       950.00          30
7902    FORD    ANALYST 7566    1981-12-3       3000.00         20
7934    MILLER  CLERK   7782    1982-1-23       1300.00         10

 

上传数据到HDFS系统中

cd /home/hadoop/luogankun/workspace/sync_data/pig
hadoop fs -put dept.txt input/pig/dept.txt
hadoop fs -put emp.txt input/pig/emp.txt

 

 

导入

CREATE TABLE TMP_TABLE(USER VARCHAR(32),AGE INT,IS_MALE BOOLEAN);

dept= LOAD 'input/pig/dept.txt' USING PigStorage('\t') AS (deptno:int,dname:chararray,loc:chararray);
emp = LOAD 'input/pig/emp.txt' USING PigStorage('\t') AS (empno:int,ename:chararray, job:chararray, mgr:int, hiredate:chararray,sal:double,comm:double,dept:int);

 

 

查看表结构

desc TMP_TABLE;
describe dept dept: {deptno:
int,dname: chararray,loc: chararray} describe emp emp: {empno: int,ename: chararray,job: chararray,mgr: int,hiredate: chararray,sal: double,comm: double,dept: int}

 

 

查询整张表

SELECT * FROM TMP_TABLE;
DUMP dept (10,ACCOUNTING,NEW YORK) (20,RESEARCH,DALLAS) (30,SALES,CHICAGO) (40,OPERATIONS,BOSTON) DUMP emp (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10)

 

 

查询前N条

SELECT * FROM TMP_TABLE LIMIT 10;
emp_table_limit
= LIMIT emp 10; DUMP emp_table_limit; (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30)

 

查询表中的某些列

SELECT USER FROM TMP_TABLE;
emp_table_empno_ename
= FOREACH emp GENERATE empno, ename; describe emp_table_empno_ename emp_table_empno_ename: {empno: int,ename: chararray} DUMP emp_table_empno_ename; (7369,SMITH) (7499,ALLEN) (7521,WARD) (7566,JONES) (7654,MARTIN) (7698,BLAKE) (7782,CLARK) (7788,SCOTT) (7839,KING) (7844,TURNER) (7876,ADAMS) (7900,JAMES) (7902,FORD) (7934,MILLER)

 

给列取别名

SELECT USER AS USER_NAME,AGE AS USER_AGE FROM TMP_TABLE;
emp_table_column_alias
= FOREACH emp GENERATE empno AS id,ename AS name; describe emp_table_column_alias emp_table_column_alias: {id: int,name: chararray} DUMP emp_table_column_alias (7369,SMITH) (7499,ALLEN) (7521,WARD) (7566,JONES) (7654,MARTIN) (7698,BLAKE) (7782,CLARK) (7788,SCOTT) (7839,KING) (7844,TURNER) (7876,ADAMS) (7900,JAMES) (7902,FORD) (7934,MILLER)

 

排序

SELECT * FROM TMP_TABLE ORDER BY AGE;
emp_table_order
= ORDER emp BY empno ASC; DUMP emp_table_order; (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10)

 

条件查询

SELECT * FROM TMP_TABLE WHERE AGE>20;
emp_table_where
= FILTER emp by sal > 1500; DUMP emp_table_where; (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20)

 

内连接Inner Join

SELECT * FROM TMP_TABLE A JOIN TMP_TABLE_2 B ON A.AGE=B.AGE;
emp_table_inner_join
= JOIN emp BY dept,dept BY deptno; describe emp_table_inner_join emp_table_inner_join: {emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int, emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int, dept::deptno: int,dept::dname: chararray,dept::loc: chararray} DUMP emp_table_inner_join; (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK) (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO)

 

左连接Left Join

SELECT * FROM TMP_TABLE A LEFT JOIN TMP_TABLE_2 B ON A.AGE=B.AGE;
emp_table_left_join
= JOIN emp BY dept LEFT OUTER,dept BY deptno; describe emp_table_left_join emp_table_left_join: {emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int, emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int, dept::deptno: int,dept::dname: chararray,dept::loc: chararray} DUMP emp_table_left_join; (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK) (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO)

 

右连接Right Join

SELECT * FROM TMP_TABLE A RIGHT JOIN TMP_TABLE_2 B ON A.AGE=B.AGE;
emp_table_right_join
= JOIN emp BY dept RIGHT OUTER,dept BY deptno; describe emp_table_right_join emp_table_right_join: {emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int, emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int, dept::deptno: int,dept::dname: chararray,dept::loc: chararray} DUMP emp_table_right_join; (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK) (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO) (,,,,,,,,40,OPERATIONS,BOSTON)

 

全连接Full Join

SELECT * FROM TMP_TABLE A  JOIN TMP_TABLE_2 B ON A.AGE=B.AGE
emp_table_full_join
= JOIN emp BY dept FULL OUTER,dept BY deptno; describe emp_table_full_join emp_table_full_join: {emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int, emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int, dept::deptno: int,dept::dname: chararray,dept::loc: chararray} DUMP emp_table_full_join; (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK) (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO) (,,,,,,,,40,OPERATIONS,BOSTON)

 

同时对多张表交叉查询

SELECT * FROM TMP_TABLE,TMP_TABLE_2;
emp_table_cross
= CROSS emp,dept; describe emp_table_cross emp_table_cross: {emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int, emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int, dept::deptno: int,dept::dname: chararray,dept::loc: chararray} DUMP emp_table_cross; (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,10,ACCOUNTING,NEW YORK) (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS) (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,30,SALES,CHICAGO) (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,40,OPERATIONS,BOSTON) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,10,ACCOUNTING,NEW YORK) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,20,RESEARCH,DALLAS) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO) (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,40,OPERATIONS,BOSTON) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,10,ACCOUNTING,NEW YORK) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,20,RESEARCH,DALLAS) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,40,OPERATIONS,BOSTON) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,10,ACCOUNTING,NEW YORK) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,30,SALES,CHICAGO) (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,40,OPERATIONS,BOSTON) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,10,ACCOUNTING,NEW YORK) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,20,RESEARCH,DALLAS) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,40,OPERATIONS,BOSTON) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,10,ACCOUNTING,NEW YORK) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,20,RESEARCH,DALLAS) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO) (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,40,OPERATIONS,BOSTON) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,20,RESEARCH,DALLAS) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,30,SALES,CHICAGO) (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,40,OPERATIONS,BOSTON) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,10,ACCOUNTING,NEW YORK) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,30,SALES,CHICAGO) (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,40,OPERATIONS,BOSTON) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,20,RESEARCH,DALLAS) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,30,SALES,CHICAGO) (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,40,OPERATIONS,BOSTON) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,10,ACCOUNTING,NEW YORK) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,20,RESEARCH,DALLAS) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,40,OPERATIONS,BOSTON) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,10,ACCOUNTING,NEW YORK) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,30,SALES,CHICAGO) (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,40,OPERATIONS,BOSTON) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,10,ACCOUNTING,NEW YORK) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,20,RESEARCH,DALLAS) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO) (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,40,OPERATIONS,BOSTON) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,10,ACCOUNTING,NEW YORK) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,30,SALES,CHICAGO) (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,40,OPERATIONS,BOSTON) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,20,RESEARCH,DALLAS) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,30,SALES,CHICAGO) (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,40,OPERATIONS,BOSTON)

 

分组GROUP BY

SELECT * FROM TMP_TABLE GROUP BY IS_MALE;
emp_table_group
= GROUP emp BY dept; describe emp_table_group emp_table_group: { group: int, emp: { (empno: int,ename: chararray,job: chararray,mgr: int,hiredate: chararray,sal: double, comm: double,dept: int) } } DUMP emp_table_group; (10,{ (7839,KING,PRESIDENT,,1981-11-17,5000.0,,10), (7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10), (7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10)}) (20,{ (7369,SMITH,CLERK,7902,1980-12-17,800.0,,20), (7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20), (7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20), (7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20), (7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20)}) (30,{ (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30), (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30), (7900,JAMES,CLERK,7698,1981-12-3,950.0,,30), (7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30), (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30), (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30)})

 

分组并统计

SELECT IS_MALE,COUNT(*) FROM TMP_TABLE GROUP BY IS_MALE;
emp_table_group_count
= GROUP emp BY dept; describe emp_table_group_count emp_table_group_count: { group: int, emp: { (empno: int,ename: chararray,job: chararray,mgr: int,hiredate: chararray,sal: double,comm: double,dept: int ) } } emp_table_group_count = FOREACH emp_table_group_count GENERATE group,COUNT($1); describe emp_table_group_count DUMP emp_table_group_count; (10,3) (20,5) (30,6) emp_table_group_count = FOREACH emp_table_group_count GENERATE emp.dept,COUNT($1); describe emp_table_group_count emp_table_group_count: {{(dept: int)},long} DUMP emp_table_group_count; ({(10),(10),(10)},3) ({(20),(20),(20),(20),(20)},5) ({(30),(30),(30),(30),(30),(30)},6)

 

去重DISTINCT

SELECT DISTINCT IS_MALE FROM TMP_TABLE;
emp_table_distinct
= FOREACH emp GENERATE dept; describe emp_table_distinct emp_table_distinct: {dept: int} emp_table_distinct = DISTINCT emp_table_distinct; describe emp_table_distinct emp_table_distinct: {dept: int} DUMP emp_table_distinct; (10) (20) (30)

 

 

pig判空

select * from emp where comm is not null;
emp_table_where_null
= FILTER emp by comm is not null; DUMP emp_table_where_null; (7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30) (7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30) (7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30) (7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30)

 

posted on 2014-08-07 14:55  瞌睡中的葡萄虎  阅读(438)  评论(0编辑  收藏  举报