5-2. pandas(DataFrame)-likeSQL
import pandas as pd
df_emp = pd.read_csv('emp.csv', encoding='utf-8')
df_dept = pd.read_csv('dept.csv', encoding='utf-8')
結合(5-1で記載済)
SQL
SELECT e.*, d.dname, d.loc
FROM emp e INNER JOIN dept d ON (e.deptno = d.deptno)
pandas
pd.merge(df_emp, df_dept, how='inner', right_on='deptno', left_on='dept')
集計(5-1で記載済)
SQL
SELECT dept, COUNT(sal), MAX(sal), MIN(sal), AVG(sal), STD(sal)
FROM emp GROUP BY dept;
pandas
df_emp.groupby('dept')[['sal']].agg(['count', 'max', 'min', 'mean', 'std'])
集計(dept別でsalが最大のempno)
SQL
WITH grp AS (
SELECT deptno, MAX(sal) as maxsal
FROM emp GROUP BY deptno
)
SELECT e.deptno, empno, sal
FROM emp e INNER JOIN grp g
ON (e.deptno = g.deptno
AND e.sal = g.maxsal)
ORDER BY e.deptno, empno;
pandas(deptでグループ化して最大のsalの金額でjoin)
※同金額が2件ある場合は2件とも取得される
pd.merge(df_emp, df_emp.groupby('dept')[['sal']].agg(max).reset_index() \
,how='inner', right_on='dept', left_on='dept') \
.query('sal_x == sal_y') \
.sort_values('dept')
==> empno ename job manager hiredate sal_x comm dept sal_y
==> 12 7839 藤原太郎 PRESIDENT NaN 1981/11/17 5000 NaN 10.0 5000
==> 2 7788 中村健一 ANALYST 7566.0 1987/04/19 3000 NaN 20.0 3000
==> 4 7902 松本亮太 ANALYST 7566.0 1981/12/03 3000 NaN 20.0 3000
==> 8 7698 伊藤誠 MANAGER 7839.0 1981/05/01 2850 NaN 30.0 2850
pandas(deptでグループ化しsalのランク1位のみを取得)
※同金額が2件ある場合、そのうち1件しか取得されない ※deptでjoinしないのでdeptがnullの行も抽出される
pd.merge(df_emp.reset_index(), df_emp.groupby('dept')[['sal']].rank(ascending=False, method='first') \
.reset_index(), how='inner', right_on='index', left_on='index') \
.query('sal_y == 1.0') \
.sort_values('dept')
==> index empno ename job manager hiredate sal_x comm dept sal_y
==> 8 8 7839 藤原太郎 PRESIDENT NaN 1981/11/17 5000 NaN 10.0 1.0
==> 7 7 7788 中村健一 ANALYST 7566.0 1987/04/19 3000 NaN 20.0 1.0
==> 5 5 7698 伊藤誠 MANAGER 7839.0 1981/05/01 2850 NaN 30.0 1.0
pandas(先にdept,sal毎にソートし、dept毎の最上位1件だけを取得)
df_emp.sort_values(by=['dept','sal'], ascending=[True,False]).groupby('dept').head(1)
==> empno ename job manager hiredate sal comm dept
==> 8 7839 藤原太郎 PRESIDENT NaN 1981/11/17 5000 NaN 10.0
==> 7 7788 中村健一 ANALYST 7566.0 1987/04/19 3000 NaN 20.0
==> 5 7698 伊藤誠 MANAGER 7839.0 1981/05/01 2850 NaN 30.0
==> 14 8001 小林政男 SALESMAN 7698.0 1985/03/17 1300 NaN NaN
集計(dept別でsalが最大の行名(行index))
SQL(rowidを取得)
WITH grp AS (
SELECT dept, MAX(sal) as maxsal
FROM emp GROUP BY dept
)
SELECT rowid
FROM emp e INNER JOIN grp g
ON (e.deptno = g.deptno
AND e.sal = g.maxsal)
ORDER BY e.deptno, empno;
pandas(dept別のindex)
df_emp.groupby('dept')['sal'].idxmax()
==> dept
==> 10.0 8
==> 20.0 7
==> 30.0 5
==> Name: sal, dtype: int64
※複数件ある場合に、複数のindexを取得したい場合は、deptでgroupbyし、そのmax()と一致するdept,salを持ったindexを取得する
pd.merge(
df_emp[['dept','sal']].reset_index(),
df_emp.groupby('dept')['sal'].max().reset_index(),
how='inner', on=['dept','sal']) \
.loc[:,['dept','index']].sort_values('dept').reset_index(drop=True)
==> dept index
==> 0 10.0 8
==> 1 20.0 7
==> 2 20.0 12
==> 3 30.0 5
縦結合(UNION)
SQL
SELECT * FROM emp WHERE dept = 10
UNION
SELECT * FROM emp WHERE dept = 20;
pandas
pd.concat([df_emp[df_emp['dept']==10], df_emp[df_emp['dept']==20]])
もしくは、
pd.concat([df_emp.query('dept==10'), df_emp.query('dept==20')])
※concatの引数はiterableなオブジェクト(リストやタプル)で指定する
==> empno ename job manager hiredate sal comm dept
==> 6 7782 山本尚人 MANAGER 7839.0 1981/06/09 2450 NaN 10.0
==> 8 7839 藤原太郎 PRESIDENT NaN 1981/11/17 5000 NaN 10.0
==> 13 7934 木村大輝 CLERK 7782.0 1982/01/23 1300 NaN 10.0
==> 0 7369 佐藤一郎 CLERK 7902.0 1980/12/17 800 NaN 20.0
==> 3 7566 田中義男 MANAGER 7839.0 1981/04/02 2975 NaN 20.0
==> 7 7788 中村健一 ANALYST 7566.0 1987/04/19 3000 NaN 20.0
==> 10 7876 吉田翔太 CLERK 7788.0 1987/05/23 1100 NaN 20.0
==> 12 7902 松本亮太 ANALYST 7566.0 1981/12/03 3000 NaN 20.0
厳密にUNIONの動作(重複行は削除)に合わせたいときは
pd.concat([df_emp.query('dept==10'), df_emp.query('dept==20')]).drop_duplicates()
とする(この例では重複は発生しないので変わりないが)