🏄‍♂[Do it! Pandas] 02 판다스 시작하기

太热了 2022. 3. 30. 23:16

In [67]:

from IPython.core.display import display, HTML

display(HTML("<style> .container{width:90% !important;}</style>"))

갭마인더 데이터 집합 불러오기¶

In [1]:

import pandas as pd # 판다스 기능 사용 
df = pd.read_csv('C:/Users/USER/Doit/data/gapminder.tsv', sep = '\t') # 탭으로 구분되어 있는 데이터 불러오기

불러온 데이터 집합 살펴보기¶

In [2]:

# 자료형 알아보기 
# series, dataframe 자료형 사용 = 데이터 프레임은 엑셀 시트와 비슷, 시리즈는 열 1개와 비슷, 데이터 프레임은 시리즈들이 각 요소가 되는 딕셔너리

# read_csv 메서드는 데이터프레임이라는 자료형으로 반환해준다. 
# 여러 메서드가 미리 정의되어 있음 

print(df.head())

       country continent  year  lifeExp       pop   gdpPercap
0  Afghanistan      Asia  1952   28.801   8425333  779.445314
1  Afghanistan      Asia  1957   30.332   9240934  820.853030
2  Afghanistan      Asia  1962   31.997  10267083  853.100710
3  Afghanistan      Asia  1967   34.020  11537966  836.197138
4  Afghanistan      Asia  1972   36.088  13079460  739.981106

In [3]:

print(type(df))

<class 'pandas.core.frame.DataFrame'>

In [6]:

# shape는 크기에 대한 정보
#데이터프레임은 자신이 가지고 있는 데이터의 행과 열의 크기에 대한 정보를 shape 라는 속성에 저장하고 있음. 

print(df.shape)

(1704, 6)

In [7]:

# columns 속성 사용시 데이터 프레임의 열 이름 확인 가능 

print(df.columns)

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [8]:

# dtypes, info 메서드를 통해 데이터프레임의 자료형 확인 가능 

print(df.dtypes)

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

In [9]:

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB
None

열 단위로 데이터 추출하기¶

In [11]:

# 1개열 추출시 시리즈, 2개 이상의 열 추출시 데이터프레임 

country_df = df['country']
print(country_df) #1개의 열 추출이니까 시리즈 자료형태
print(type(country_df))

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object
<class 'pandas.core.series.Series'>

In [12]:

print(country_df.head())

0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object

In [13]:

print(country_df.tail())

1699    Zimbabwe
1700    Zimbabwe
1701    Zimbabwe
1702    Zimbabwe
1703    Zimbabwe
Name: country, dtype: object

In [14]:

# 리스트에 열 이름을 전달하면 여러 개의 열을 한 번에 추출 가능 

subset = df[['country', 'continent', 'year']]
print(type(subset))

<class 'pandas.core.frame.DataFrame'>

In [15]:

print(subset.head())

       country continent  year
0  Afghanistan      Asia  1952
1  Afghanistan      Asia  1957
2  Afghanistan      Asia  1962
3  Afghanistan      Asia  1967
4  Afghanistan      Asia  1972

In [16]:

print(subset.tail())

       country continent  year
1699  Zimbabwe    Africa  1987
1700  Zimbabwe    Africa  1992
1701  Zimbabwe    Africa  1997
1702  Zimbabwe    Africa  2002
1703  Zimbabwe    Africa  2007

loc 속성으로 행 단위 데이터 추출하기¶

In [17]:

# loc - 인덱스를 기준으로 행 데이터 추출 
# iloc - 행 번호를 기준으로 행 데이터 추출 

# 행 번호는 데이터의 순서를 따라가기 때문에 정수만으로 데이터를 조회하거나 추출할 수 있으며 실제 데이터프레임에서는 확인 할 수 없는 값

In [18]:

print(df.loc[0])

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap     779.445314
Name: 0, dtype: object

In [23]:

# 데이터프레임의 마지막 행 데이터 추출? - 마지막 행 인덱스 알기!

df.shape[0] # 행의 길이 

Out[23]:

In [24]:

# 마지막 행 인덱스는 행의 크기 - 1 
number_of_rows = df.shape[0]
last_row_index = number_of_rows -1 
print(df.loc[last_row_index])

country        Zimbabwe
continent        Africa
year               2007
lifeExp          43.487
pop            12311143
gdpPercap    469.709298
Name: 1703, dtype: object

In [25]:

# tail 메서드 활용하기 

print(df.tail(n=1))

       country continent  year  lifeExp       pop   gdpPercap
1703  Zimbabwe    Africa  2007   43.487  12311143  469.709298

In [26]:

print(df.loc[[0,99,999]])

         country continent  year  lifeExp       pop    gdpPercap
0    Afghanistan      Asia  1952   28.801   8425333   779.445314
99    Bangladesh      Asia  1967   43.453  62821884   721.186086
999     Mongolia      Asia  1967   51.253   1149500  1226.041130

tail과 loc는 조금 달라요!¶

In [28]:

# loc로 반환한 데이터 자료형은 시리즈, tail 메서드로 반환한 데이터 자료형은 데이터프레임 

subset_loc = df.loc[0]
subset_tail = df.tail(n=1)

print(type(subset_loc))
print(type(subset_tail))

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>

iloc 속성으로 행 단위 데이터 추출하기¶

In [31]:

print(df.iloc[1])

country      Afghanistan
continent           Asia
year                1957
lifeExp           30.332
pop              9240934
gdpPercap      820.85303
Name: 1, dtype: object

In [32]:

print(df.iloc[99])

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap    721.186086
Name: 99, dtype: object

In [33]:

print(df.iloc[-1])

country        Zimbabwe
continent        Africa
year               2007
lifeExp          43.487
pop            12311143
gdpPercap    469.709298
Name: 1703, dtype: object

In [34]:

print(df.iloc[[0,99,999]])

         country continent  year  lifeExp       pop    gdpPercap
0    Afghanistan      Asia  1952   28.801   8425333   779.445314
99    Bangladesh      Asia  1967   43.453  62821884   721.186086
999     Mongolia      Asia  1967   51.253   1149500  1226.041130

파이썬 슬라이싱 구문을 조합하여 원하는 데이터 추출하기¶

In [35]:

# loc 속성의 열 지정값에 정수 리스트 전달시 오류 
# iloc 속성의 열 지정값에 문자열 리스트 전달시 오류 

subset = df.loc[:, ['year','pop']]
print(subset.head())

   year       pop
0  1952   8425333
1  1957   9240934
2  1962  10267083
3  1967  11537966
4  1972  13079460

In [37]:

subset1 = df.iloc[:,[2,4,-1]]
print(subset1.head())

   year       pop   gdpPercap
0  1952   8425333  779.445314
1  1957   9240934  820.853030
2  1962  10267083  853.100710
3  1967  11537966  836.197138
4  1972  13079460  739.981106

iloc 속성과 range 메서드로 원하는 데이터 추출하기¶

In [38]:

# range 메소드는 지정한 구간의 정수 리스트 반환 - 제너레이터
# iloc 속성의 열 지정값에는 정수 리스트 전달
# 활용하기 !!

In [39]:

small_range = list(range(5))
print(small_range)

[0, 1, 2, 3, 4]

In [40]:

print(type(small_range))

<class 'list'>

In [42]:

subset = df.iloc[:,small_range]
print(subset.head())

       country continent  year  lifeExp       pop
0  Afghanistan      Asia  1952   28.801   8425333
1  Afghanistan      Asia  1957   30.332   9240934
2  Afghanistan      Asia  1962   31.997  10267083
3  Afghanistan      Asia  1967   34.020  11537966
4  Afghanistan      Asia  1972   36.088  13079460

In [43]:

small_range = list(range(3,6))
print(small_range)

[3, 4, 5]

In [44]:

subset = df.iloc[:,small_range]
print(subset.head())

   lifeExp       pop   gdpPercap
0   28.801   8425333  779.445314
1   30.332   9240934  820.853030
2   31.997  10267083  853.100710
3   34.020  11537966  836.197138
4   36.088  13079460  739.981106

In [46]:

small_range = list(range(0,6,2))
subset = df.iloc[:,small_range]
print(subset.head())

       country  year       pop
0  Afghanistan  1952   8425333
1  Afghanistan  1957   9240934
2  Afghanistan  1962  10267083
3  Afghanistan  1967  11537966
4  Afghanistan  1972  13079460

열 지정값에 파이썬 슬라이싱을 사용하여 원하는 데이터 추출하기¶

In [49]:

# range 보다 더 간편하게 사용할 수 있는 파이썬 슬라이싱 구문 선호 - 리스트로 변환하는 과정 필요 없음

subset = df.iloc[:, :3]
print(subset.head())

# small_range = list(range(3))
# subset = df.iloc[: , small_range]

       country continent  year
0  Afghanistan      Asia  1952
1  Afghanistan      Asia  1957
2  Afghanistan      Asia  1962
3  Afghanistan      Asia  1967
4  Afghanistan      Asia  1972

In [50]:

subset = df.iloc[:, 0:6:2]
print(subset.head())

       country  year       pop
0  Afghanistan  1952   8425333
1  Afghanistan  1957   9240934
2  Afghanistan  1962  10267083
3  Afghanistan  1967  11537966
4  Afghanistan  1972  13079460

loc, iloc 자유자재로 사용하기¶

In [51]:

print(df.iloc[[0,99,999],[0,3,5]])

         country  lifeExp    gdpPercap
0    Afghanistan   28.801   779.445314
99    Bangladesh   43.453   721.186086
999     Mongolia   51.253  1226.041130

In [53]:

# iloc 속성으로 열 지정값을 정수로 전달해 줄수도 있지만, 나중에 어떤 데이터인지 구별 안감 
# 그래서 보통은 loc 속성을 이용해 열 지정값을 열 이름으로 지정해 준다. 

print(df.loc[[0,99,999],['country','lifeExp','gdpPercap']])

         country  lifeExp    gdpPercap
0    Afghanistan   28.801   779.445314
99    Bangladesh   43.453   721.186086
999     Mongolia   51.253  1226.041130

In [55]:

# 인덱스가 10인 행부터 13인 행의 특정 열 추출 

print(df.loc[10:13, ['country','lifeExp','gdpPercap']]) # 특정열을 []로 묶은게 아니라면 [] 두개 안해도됨

        country  lifeExp    gdpPercap
10  Afghanistan   42.129   726.734055
11  Afghanistan   43.828   974.580338
12      Albania   55.230  1601.056136
13      Albania   59.280  1942.284244

그룹화한 데이터의 평균 구하기¶

In [56]:

print(df.groupby('year')['lifeExp'].mean())

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [57]:

grouped_year_df = df.groupby('year')
print(type(grouped_year_df))

<class 'pandas.core.groupby.generic.DataFrameGroupBy'>

In [58]:

print(grouped_year_df)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002266D69AC10>

In [59]:

grouped_year_df_lifeExp = grouped_year_df['lifeExp']
print(type(grouped_year_df_lifeExp))

<class 'pandas.core.groupby.generic.SeriesGroupBy'>

In [61]:

mean_lifeExp_by_year = grouped_year_df_lifeExp.mean()
print(type(mean_lifeExp_by_year))

<class 'pandas.core.series.Series'>

In [62]:

multi_group_var = df.groupby(['year','continent'])[['lifeExp','gdpPercap']].mean()
print(multi_group_var)

                  lifeExp     gdpPercap
year continent                         
1952 Africa     39.135500   1252.572466
     Americas   53.279840   4079.062552
     Asia       46.314394   5195.484004
     Europe     64.408500   5661.057435
     Oceania    69.255000  10298.085650
1957 Africa     41.266346   1385.236062
     Americas   55.960280   4616.043733
     Asia       49.318544   5787.732940
     Europe     66.703067   6963.012816
     Oceania    70.295000  11598.522455
1962 Africa     43.319442   1598.078825
     Americas   58.398760   4901.541870
     Asia       51.563223   5729.369625
     Europe     68.539233   8365.486814
     Oceania    71.085000  12696.452430
1967 Africa     45.334538   2050.363801
     Americas   60.410920   5668.253496
     Asia       54.663640   5971.173374
     Europe     69.737600  10143.823757
     Oceania    71.310000  14495.021790
1972 Africa     47.450942   2339.615674
     Americas   62.394920   6491.334139
     Asia       57.319269   8187.468699
     Europe     70.775033  12479.575246
     Oceania    71.910000  16417.333380
1977 Africa     49.580423   2585.938508
     Americas   64.391560   7352.007126
     Asia       59.610556   7791.314020
     Europe     71.937767  14283.979110
     Oceania    72.855000  17283.957605
1982 Africa     51.592865   2481.592960
     Americas   66.228840   7506.737088
     Asia       62.617939   7434.135157
     Europe     72.806400  15617.896551
     Oceania    74.290000  18554.709840
1987 Africa     53.344788   2282.668991
     Americas   68.090720   7793.400261
     Asia       64.851182   7608.226508
     Europe     73.642167  17214.310727
     Oceania    75.320000  20448.040160
1992 Africa     53.629577   2281.810333
     Americas   69.568360   8044.934406
     Asia       66.537212   8639.690248
     Europe     74.440100  17061.568084
     Oceania    76.945000  20894.045885
1997 Africa     53.598269   2378.759555
     Americas   71.150480   8889.300863
     Asia       68.020515   9834.093295
     Europe     75.505167  19076.781802
     Oceania    78.190000  24024.175170
2002 Africa     53.325231   2599.385159
     Americas   72.422040   9287.677107
     Asia       69.233879  10174.090397
     Europe     76.700600  21711.732422
     Oceania    79.740000  26938.778040
2007 Africa     54.806038   3089.032605
     Americas   73.608120  11003.031625
     Asia       70.728485  12473.026870
     Europe     77.648600  25054.481636
     Oceania    80.719500  29810.188275

그룹화한 데이터의 개수 세어보기¶

In [63]:

# 그룹화한 데이터의 개수 = 빈도수 - nunique 메서드 사용

print(df.groupby('continent')['country'].nunique())

continent
Africa      52
Americas    25
Asia        33
Europe      30
Oceania      2
Name: country, dtype: int64

그래프 만들기¶

In [64]:

%matplotlib inline
import matplotlib.pyplot as plt

In [65]:

global_yearly_life_expectancy = df.groupby('year')['lifeExp'].mean()
print(global_yearly_life_expectancy)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [66]:

global_yearly_life_expectancy.plot()

Out[66]:

<AxesSubplot:xlabel='year'>

저작자표시 (새창열림)