Class 11: Cleaning Data

import pandas as pd
# %load http://drsmb.co/310
data_url = 'https://github.com/rhodyprog4ds/inclass-data/raw/main/ca_dds_summary.xlsx'

Recall from Wednesday

ca_dds_df = pd.read_excel(data_url, header=list(range(3)))
ca_dds_df
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-3-4542499f505a> in <module>
----> 1 ca_dds_df = pd.read_excel(data_url, header=list(range(3)))
      2 ca_dds_df

/opt/hostedtoolcache/Python/3.7.10/x64/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    297                 )
    298                 warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
--> 299             return func(*args, **kwargs)
    300 
    301         return wrapper

/opt/hostedtoolcache/Python/3.7.10/x64/lib/python3.7/site-packages/pandas/io/excel/_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options)
    334     if not isinstance(io, ExcelFile):
    335         should_close = True
--> 336         io = ExcelFile(io, storage_options=storage_options, engine=engine)
    337     elif engine and engine != io.engine:
    338         raise ValueError(

/opt/hostedtoolcache/Python/3.7.10/x64/lib/python3.7/site-packages/pandas/io/excel/_base.py in __init__(self, path_or_buffer, engine, storage_options)
   1101             if ext != "xls" and xlrd_version >= "2":
   1102                 raise ValueError(
-> 1103                     f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
   1104                     f"only the xls format is supported. Install openpyxl instead."
   1105                 )

ValueError: Your version of xlrd is 2.0.1. In xlrd >= 2.0, only the xls format is supported. Install openpyxl instead.
print(ca_dds_df)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-e640822e617f> in <module>
----> 1 print(ca_dds_df)

NameError: name 'ca_dds_df' is not defined

First we can look at the index

ca_dds_df.index
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-f24e6493175a> in <module>
----> 1 ca_dds_df.index

NameError: name 'ca_dds_df' is not defined

then the first’s name

ca_dds_df.columns[0]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-6dcb8c4e6570> in <module>
----> 1 ca_dds_df.columns[0]

NameError: name 'ca_dds_df' is not defined

For this we want to se that as the index

ca_dds_df.set_index(ca_dds_df.columns[0],inplace=True)

ca_dds_df
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-894c600386fd> in <module>
----> 1 ca_dds_df.set_index(ca_dds_df.columns[0],inplace=True)
      2 
      3 ca_dds_df

NameError: name 'ca_dds_df' is not defined

We can get rid of rows that are all nan, that have no data in them, using dropna with how='all'

ca_dds_df.dropna(how='all',inplace=True)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-b9f52e336c27> in <module>
----> 1 ca_dds_df.dropna(how='all',inplace=True)

NameError: name 'ca_dds_df' is not defined
ca_dds_df.index
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-f24e6493175a> in <module>
----> 1 ca_dds_df.index

NameError: name 'ca_dds_df' is not defined

We want to still rename this, we we’ll fill that in manually

ca_dds_df.index.rename('Age Cohort',inplace=True)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-fcfa8204b94f> in <module>
----> 1 ca_dds_df.index.rename('Age Cohort',inplace=True)

NameError: name 'ca_dds_df' is not defined

Now we want to unstack,

ca_dds_df.unstack(level= [0,1])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-06d6e6545889> in <module>
----> 1 ca_dds_df.unstack(level= [0,1])

NameError: name 'ca_dds_df' is not defined

but this doesn’t quite get what we want, instead we can

We can rotate (transpose .T )this, then unstack and rotate it back to get the outcome we want here.

ca_dds_df.T.unstack(level= [0,1]).T.reset_index()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-c8bbbfe0591e> in <module>
----> 1 ca_dds_df.T.unstack(level= [0,1]).T.reset_index()

NameError: name 'ca_dds_df' is not defined

This looks good, so we’ll save it to a new DataFrame

ca_dds_clean = ca_dds_df.T.unstack(level= [0,1]).T.reset_index()
ca_dds_clean.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-eff451103ff6> in <module>
----> 1 ca_dds_clean = ca_dds_df.T.unstack(level= [0,1]).T.reset_index()
      2 ca_dds_clean.head()

NameError: name 'ca_dds_df' is not defined

We want to rename the two with nondescript names, but the index is not mutable

ca_dds_clean.columns[1] = 'Race'
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-f771ccd0e865> in <module>
----> 1 ca_dds_clean.columns[1] = 'Race'

NameError: name 'ca_dds_clean' is not defined
type(ca_dds_clean.columns)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-15-0372e536c94d> in <module>
----> 1 type(ca_dds_clean.columns)

NameError: name 'ca_dds_clean' is not defined

Instead we can use rename with a dictionary, this way is good also because it’s explicit and readable.

ca_dds_clean.rename(columns= {'level_1':'Race', 'level_2':'Gender'},inplace=True)
ca_dds_clean.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-16-e447bbfeee5e> in <module>
----> 1 ca_dds_clean.rename(columns= {'level_1':'Race', 'level_2':'Gender'},inplace=True)
      2 ca_dds_clean.head()

NameError: name 'ca_dds_clean' is not defined
ca_dds_clean['count'][0]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-17-f579f06c67b3> in <module>
----> 1 ca_dds_clean['count'][0]

NameError: name 'ca_dds_clean' is not defined
ca_dds_clean.Gender[0]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-18-2e64fd505c34> in <module>
----> 1 ca_dds_clean.Gender[0]

NameError: name 'ca_dds_clean' is not defined
ca_dds_clean.Gender[0].lower()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-693531faa940> in <module>
----> 1 ca_dds_clean.Gender[0].lower()

NameError: name 'ca_dds_clean' is not defined
clean_cols = {c:c.lower().replace(' ','_') for c in ca_dds_clean.columns}
ca_dds_clean.rename(columns=clean_cols,inplace=True)
ca_dds_clean.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-20-bbc80f90b452> in <module>
----> 1 clean_cols = {c:c.lower().replace(' ','_') for c in ca_dds_clean.columns}
      2 ca_dds_clean.rename(columns=clean_cols,inplace=True)
      3 ca_dds_clean.head()

NameError: name 'ca_dds_clean' is not defined
ca_dds_clean.fillna('*')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-21-3bf797621552> in <module>
----> 1 ca_dds_clean.fillna('*')

NameError: name 'ca_dds_clean' is not defined

Questions after class

What happens when we use dropna?

We can see the documentation it drops the whole row or column.

import numpy as np #
test_df = pd.DataFrame(data = [[1,3,4,5],[2 ,6, np.na]])
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-22-482bb49100aa> in <module>
      1 import numpy as np #
----> 2 test_df = pd.DataFrame(data = [[1,3,4,5],[2 ,6, np.na]])

/opt/hostedtoolcache/Python/3.7.10/x64/lib/python3.7/site-packages/numpy/__init__.py in __getattr__(attr)
    302 
    303             raise AttributeError("module {!r} has no attribute "
--> 304                                  "{!r}".format(__name__, attr))
    305 
    306         def __dir__():

AttributeError: module 'numpy' has no attribute 'na'