Post List

2015년 8월 3일 월요일

MongoDB Study #06 추출과 분석 : Aggregation FrameWork

1. 특징

  - Big Data 추출에 최적화 되어 만들어진 기능으로 V 2.2 부터 지원되었습니다.
  - 내무적으로는 MapReduce를 사용하고 있으며 빠른 성능을 보장합니다.
    (MapReduce는 Javascript로 생성되었습니다.)
  - 외부 Data 처리에는 제한적입니다. MongoDB 내의 Data만 처리할 수 있습니다.

2. 함수 구조

총 6개의 연결 연산자로 이루어져 있습니다.

$project : SELECT
$match : WHERE
$group : GROUP BY
$sort : ORDER BY
$limit
$skip 

db.emp.aggregate(
{$project: {_id:0 , empno:1 , ename:{$toLower:"$ename"}, substr_name:{%substr:["$ename",1,2]}, sal:1}},
{$match: {$or: [deptno:10, deptno:20]},
{$group: {_id: "$deptno", addTotalAmount:{$sum:"$sal"}, avgTotalAmount:{$avg:"$sal"}}})

SELECT empno, lower(ename), substr(ename,1,2), SUM(sal), AVG(sal)
  FROM emp
 WHERE deptno = 10 OR deptno = 20
GROUP BY deptno;

이제 실습을 통해서 익혀보도록 하겠습니다.
실습에 사용한 Collection은 앞에 05에서 사용한 employees 를 그대로 사용할 예정입니다.
편의상 emp로 이름을 변경하겠습니다.

> db.employees.renameCollection("emp")

SELECT empno, LOWER(ename) ename, UPPER(job) job, SUBSTR(ename, 1, 2) substr_name, STRCMP(ename,'SJYUN') str_compare, sal
  FROM emp
 WHERE (deptno = 10 AND sal BETWEEN 500 AND 3000) AND (job = 'CLERK' OR job  = 'SALEMAN');

> db.emp.aggregate(
$match : { $and : [ { deptno:10 } , { sal : { $gte:500 , $lte:3000 } } ] } },
$match : { $or   : [ { job:"CLERK" } , { job:"SALESMAN" } ] } },
$project : { 
    _id : 0,
    empno : 1,
    ename : { $toLower : "$ename" },
    job : { $toUpper : "$job" },
    substr_name : { $substr : [ "$ename" , 1 , 2 ] },
    str_compare : { $strcasecmp : [ "$ename" , "SJYUN" ] } , sal : 1 }
} )

{ "empno" : 7934, "ename" : "clerk", "job" : "CLERK", "sal" : 1300, "substr_name" : "LE", "str_compare" : -1 }

SELECT empno, sal, comm FROM emp WHERE deptno = 30

> db.emp.aggregate ( 
{ $match : { deptno:30 } } ,
{ $project : { _id : 0 , empno : 1 , stats : { sal : "$sal" , comm : "$comm" } } } // 2개의 field를 묶어서 sub-domument로 저장
)

{ "empno" : 7499, "stats" : { "sal" : 1600, "comm" : 300 } }
{ "empno" : 7521, "stats" : { "sal" : 1250, "comm" : 500 } }
{ "empno" : 7654, "stats" : { "sal" : 1250, "comm" : 1400 } }
{ "empno" : 7698, "stats" : { "sal" : 2850 } }
{ "empno" : 7844, "stats" : { "sal" : 1500 } }
{ "empno" : 7900, "stats" : { "sal" : 950 } }

SELECT empno, ename, sal, NVL(comm,0) comm,
       (sal + NVL(comm,0)) sum_avg_add,
       (sal - NVL(comm,0)) sum_avg_subtract,
       (sal * 2) sum_avg_multiply,
       (sal / 2) sum_avg_divide
  FROM emp
 WHERE deptno = 30

> db.emp.aggregate (
{ $match : { deptno : 30 } } ,
{ $project : {
    _id : 0 ,
    empno : 1 ,
    ename : 1 ,
    sal : 1 ,
    comm : { $ifNull : [ "$comm" , 0 ] } ,
    sum_avg_add : { $add : [ "$sal" , { $ifNull : [ "$comm" , 0 ] } ] } ,
    sum_avg_subtract : { $subtract : [ "$sal" , { $ifNull : [ "$comm" , 0 ] } ] } ,
    sum_avg_multiply : { $multiply : [ "$sal" , 2 ] } ,
    sum_avg_divide : { $divide : ["$sal", 2] } }
} )

{ "empno" : 7499, "ename" : "ALLEN", "sal" : 1600, "comm" : 300, "sum_avg_add" : 1900, "sum_avg_subtract" : 1300, "sum_avg_multiply" : 3200, "sum_avg_divide" : 800 }
{ "empno" : 7521, "ename" : "WARD", "sal" : 1250, "comm" : 500, "sum_avg_add" : 1750, "sum_avg_subtract" : 750, "sum_avg_multiply" : 2500, "sum_avg_divide" : 625 }
{ "empno" : 7654, "ename" : "MARTIN", "sal" : 1250, "comm" : 1400, "sum_avg_add" : 2650, "sum_avg_subtract" : -150, "sum_avg_multiply" : 2500, "sum_avg_divide" : 625 }
{ "empno" : 7698, "ename" : "BLAKE", "sal" : 2850, "comm" : 0, "sum_avg_add" : 2850, "sum_avg_subtract" : 2850, "sum_avg_multiply" : 5700, "sum_avg_divide" : 1425 }
{ "empno" : 7844, "ename" : "TURNER", "sal" : 1500, "comm" : 0, "sum_avg_add" : 1500, "sum_avg_subtract" : 1500, "sum_avg_multiply" : 3000, "sum_avg_divide" : 750 }
{ "empno" : 7900, "ename" : "JAMES", "sal" : 950, "comm" : 0, "sum_avg_add" : 950, "sum_avg_subtract" : 950, "sum_avg_multiply" : 1900, "sum_avg_divide" : 475 }

SELECT COUNT(*), SUM(SAL), AVG(SAL), MAX(SAL), MIN(SAL), SUM(SAL - NVL(COMM,0)), SUM(SAL * 2), SUM(SAL / 2) FROM EMP
// $first , $last 에 대한 적당한 함수를 모르겠네요. ㅠㅠ

> db.emp.aggregate (
$project : {
    _id : 0 ,
    empno : 1 ,
    ename : 1 ,
    sal : 1 ,
    comm : { $ifNull : [ "$comm" , 0 ] } ,
    sum_avg_add : { $add : [ "$sal" , { $ifNull : [ "$comm" , 0 ] } ] } ,
    sum_avg_subtract : { $subtract : [ "$sal" , { $ifNull : [ "$comm" , 0 ] } ] } ,
    sum_avg_multiply : { $multiply : [ "$sal" , 2 ] } ,
    sum_avg_divide : { $divide : ["$sal", 2] } }
} ,
{ $group : {
    _id : 0, // 전체 Data 집계
    numTotalAmout : { $sum : 1 } ,
    addTotalAmount : { $sum : "$sal" } ,
    avgTotalAmount : { $avg : "$sal" } ,
    maxAmount : { $max : "$sal" } ,
    minAmount : { $min : "$sal" } ,
    firstEmpno : { $first : "$empno" } , // 첫번째 값
    lastEmpno : { $last : "$empno" } ,  // 마지막 값
    addAmount : { $sum : "$sum_avg_subtract" } ,
    multiplyAmount : { $sum : "$sum_avg_multiply" } ,
    divideAmount : { $sum : "$sum_avg_divide" }
} } )

{ "_id" : 0, "numTotalAmout" : 14, "addTotalAmount" : 29025, "avgTotalAmount" : 2073.214285714286, "maxAmount" : 5000, "minAmount" : 800, "firstEmpno" : 7369, "lastEmpno" : 7934, "addAmount" : 26825, "multiplyAmount" : 58050, "divideAmount" : 14512.5 }

SELECT COUNT(*), SUM(SAL), AVG(SAL), MAX(SAL), MIN(SAL), FROM EMP GROUP BY DEPTNO

> db.emp.aggregate (
$group : {
    _id : "$deptno",
    numTotalAmout : { $sum : 1 } ,
    addTotalAmount : { $sum : "$sal" } ,
    avgTotalAmount : { $avg : "$sal" } ,
    maxAmount : { $max : "$sal" } ,
    minAmount : { $min : "$sal" } ,
    firstEmpno : { $first : "$empno" } ,
    lastEmpno : { $last : "$empno" }   
} } )

{ "_id" : 10, "numTotalAmout" : 3, "addTotalAmount" : 8750, "avgTotalAmount" : 2916.6666666666665, "maxAmount" : 5000, "minAmount" : 1300, "firstEmpno" : 7782, "lastEmpno" : 7934 }
{ "_id" : 30, "numTotalAmout" : 6, "addTotalAmount" : 9400, "avgTotalAmount" : 1566.6666666666667, "maxAmount" : 2850, "minAmount" : 950, "firstEmpno" : 7499, "lastEmpno" : 7900 }
{ "_id" : 20, "numTotalAmout" : 5, "addTotalAmount" : 10875, "avgTotalAmount" : 2175, "maxAmount" : 3000, "minAmount" : 800, "firstEmpno" : 7369, "lastEmpno" : 7902 }

아래 문장에 적당한 SQL문을 못만들겠어요. ㅠㅠ

> db.emp.aggregate (
$group : {
    _id : "$deptno",
    enames : { $addToSet : "$ename" }  // 배열로 생성
} } )

{ "_id" : 10, "enames" : [ "CLERK", "PRESIDENT", "CLARK" ] }
{ "_id" : 30, "enames" : [ "TURNER", "BLAKE", "JAMES", "MARTIN", "WARD", "ALLEN" ] }
{ "_id" : 20, "enames" : [ "FORD", "ADAMS", "SCOTT", "JONES", "SMITH" ] }

SELECT EMPNO, ENAME FROM EMP WHERE DEPTNO = 30 ORDER BY EMPNO

> db.emp.aggregate (
{ $match : { deptno : 30 } } ,
{ $sort : { empno : 1 } } ,
{ $project : { _id : 0 , empno : 1 , ename : 1 } }
)

{ "empno" : 7499, "ename" : "ALLEN" }
{ "empno" : 7521, "ename" : "WARD" }
{ "empno" : 7654, "ename" : "MARTIN" }
{ "empno" : 7698, "ename" : "BLAKE" }
{ "empno" : 7844, "ename" : "TURNER" }
{ "empno" : 7900, "ename" : "JAMES" }

SELECT EMPNO, ENAME FROM EMP WHERE ROWNUM < 6
// $skip에 대해서는 잘 못하겠어요. ㅠㅠ sub-query 만들어서 정렬해서 ROWNUM을 쓰면 되겠지만, 귀찮귀찮 ㅎ ;;;

> db.emp.aggregate (
{ $limit : 5 } , // 5개의 document만 추출
{ $skip : 1 } , // 1개의 document를 제외하고 2번째 부터 추출
{ $sort : { empno : 1 } } ,
$project : { _id : 0 , empno : 1 , ename : 1 } }
)

{ "empno" : 7499, "ename" : "ALLEN" }
{ "empno" : 7521, "ename" : "WARD" }
{ "empno" : 7566, "ename" : "JONES" }
{ "empno" : 7654, "ename" : "MARTIN" }

SELECT EMPNO, ENAME, JOB, DECODE(COMM, NULL, 0, 1) FROM EMP WHERE DEPTNO = 30 ORDER BY EMPNO;

> db.emp.aggregate (
$match : { deptno : 30 } } ,
$sort : { empno : 1 } } ,
$project : { _id : 0 , empno : 1 , ename : 1, job : 1 , comm : 1 ,
    condition : { $cond : [ "$comm" , 1 , 0 ]  } } } // comm field 값이 Null이 아니면 True, Null이면 False
)

{ "empno" : 7499, "ename" : "ALLEN", "job" : "SALESMAN", "comm" : 300, "condition" : 1 }
{ "empno" : 7521, "ename" : "WARD", "job" : "SALESMAN", "comm" : 500, "condition" : 1 }
{ "empno" : 7654, "ename" : "MARTIN", "job" : "SALESMAN", "comm" : 1400, "condition" : 1 }
{ "empno" : 7698, "ename" : "BLAKE", "job" : "MANAGER", "condition" : 0 }
{ "empno" : 7844, "ename" : "TURNER", "job" : "SALESMAN", "condition" : 0 }
{ "empno" : 7900, "ename" : "JAMES", "job" : "CLERK", "condition" : 0 }

먼저 emp collection을 drop() 한 다음 hiredate를 날짜값으로 insert 다시 하여 실행하세요.

db.emp.drop()

db.emp.insert({empno:7369 , ename : "SMITH", job : "CLERK", hiredate : ISODate("1980-12-17"), sal : 800, deptno : 20 }) 
db.emp.insert({empno:7499 , ename : "ALLEN", job : "SALESMAN", hiredate : ISODate("1981-02-20"), sal :1600, comm : 300, deptno : 30 })
db.emp.insert({empno:7521 , ename : "WARD", job : "SALESMAN", hiredate : ISODate("1981-02-22"), sal : 1250, comm : 500, deptno : 30 })
db.emp.insert({empno:7566 , ename : "JONES", job : "MANAGER", hiredate : ISODate("1981-04-02"), sal : 2975, deptno : 20 })
db.emp.insert({empno:7654 , ename : "MARTIN", job : "SALESMAN", hiredate : ISODate("1981-09-28"), sal : 1250, comm : 1400, deptno : 30 })
db.emp.insert({empno:7698 , ename : "BLAKE", job : "MANAGER", hiredate : ISODate("1981-05-01"), sal : 2850, deptno : 30 })
db.emp.insert({empno:7782 , ename : "CLARK", job : "MANAGER", hiredate : ISODate("1981-06-09"), sal : 2450, deptno : 10 })
db.emp.insert({empno:7788 , ename : "SCOTT", job : "ANALYST", hiredate : ISODate("1987-06-13"), sal : 3000, deptno : 20 })
db.emp.insert({empno:7839 , ename : "PRESIDENT", job : "CEO", hiredate : ISODate("1981-11-17"), sal : 5000, deptno : 10 })
db.emp.insert({empno:7844 , ename : "TURNER", job : "SALESMAN", hiredate : ISODate("1981-09-08"), sal : 1500, deptno : 30 }) 
db.emp.insert({empno:7876 , ename : "ADAMS", job : "CLERK", hiredate : ISODate("1987-06-13"), sal : 1100, deptno : 20 })
db.emp.insert({empno:7900 , ename : "JAMES", job : "CLERK", hiredate : ISODate("1981-12-03"), sal : 950, deptno : 30 })
db.emp.insert({empno:7902 , ename : "FORD", job : "ANALYST", hiredate : ISODate("1981-12-03"), sal : 3000, deptno : 20 })
db.emp.insert({empno:7934 , ename : "CLERK", job : "CLERK", hiredate : ISODate("1982-01-23"), sal : 1300, deptno : 10 })

db.emp.find()

db.emp.aggregate (
{ $match : { empno : 7782 } } ,
{ $project : {
    _id : 0 , empno : 1 , ename : 1 ,
    year : { $year : "$hiredate" } ,
    month : { $month : "$hiredate" } ,
    week : { $week : "$hiredate" } ,
    hour : { $hour : "$hiredate" } ,
    minute : { $minute : "$hiredate" } ,
    second : { $second : "$hiredate" } ,
    dayOfYear : { $dayOfYear : "$hiredate" } ,
    dayOfMonth : { $dayOfMonth : "$hiredate" } ,
    dayOfWeek : { $dayOfWeek : "$hiredate" }
 } } )

{ "empno" : 7782, "ename" : "CLARK", "year" : 1981, "month" : 6, "week" : 23, "hour" : 0, "minute" : 0, "second" : 0, "dayOfYear" : 160, "dayOfMonth" : 9, "dayOfWeek" : 3 }

댓글 없음:

댓글 쓰기