7. Tidy Data and Reshaping Datasets#

import pandas as pd
import seaborn as sns

sns.set_theme(palette='colorblind',font_scale=2)
url_base = 'https://raw.githubusercontent.com/rhodyprog4ds/rhodyds/main/data/'

datasets = ['study_a.csv','study_b.csv','study_c.csv']
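# '-' marks missing values in these files, so read it in as NaN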
list_of_df = [pd.read_csv(url_base + dataset,na_values='-') for dataset in datasets]
list_of_df[0]
name treatmenta treatmentb
0 John Smith NaN 2
1 Jane Doe 16.0 11
2 Mary Johnson 3.0 1
list_of_df[1]
intervention John Smith Jane Doe Mary Johnson
0 treatmenta NaN 16 3
1 treatmentb 2.0 11 1
list_of_df[2]
person treatment result
0 John Smith a NaN
1 Jane Doe a 16.0
2 Mary Johnson a 3.0
3 John Smith b 2.0
4 Jane Doe b 11.0
5 Mary Johnson b 1.0
list_of_df[2].mean()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[7], line 1
----> 1 list_of_df[2].mean()

[... long pandas-internals traceback elided ...]

TypeError: Could not convert ['John SmithJane DoeMary JohnsonJohn SmithJane DoeMary Johnson' 'aaabbb'] to numeric
sum([16,3,2,11,1])/5
6.6
sum([16,3,2,11,1,0])/6
5.5
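The two hand computations differ only in how they treat the missing value. pandas skips NaN by default, so the same two answers can be produced from the tidy frame (a small sketch):

# NaN is skipped by default, so this averages the 5 observed values: 6.6
list_of_df[2]['result'].mean()
# treating the missing result as 0 averages over all 6 rows instead: 5.5
list_of_df[2]['result'].fillna(0).mean()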
list_of_df[2].groupby('treatment').mean()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[10], line 1
----> 1 list_of_df[2].groupby('treatment').mean()

[... long pandas-internals traceback elided ...]

TypeError: Could not convert John SmithJane DoeMary Johnson to numeric
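The aggregation fails because the person column, which holds strings, is still part of the frame. One fix (a sketch, one of several options) is to select the numeric column before aggregating:

# average only the result column within each treatment group;
# NaN is skipped, so treatment a averages just 16 and 3
list_of_df[2].groupby('treatment')['result'].mean()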
list_of_df[2].groupby('person').mean()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 list_of_df[2].groupby('person').mean()

[... long pandas-internals traceback elided ...]

TypeError: Could not convert ab to numeric
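Grouping by person hits the same problem with the leftover treatment strings. Recent pandas versions also accept numeric_only=True, which drops non-numeric columns before aggregating (a sketch):

# ignore non-numeric columns instead of selecting one explicitly
list_of_df[2].groupby('person').mean(numeric_only=True)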
dfa = list_of_df[0]
dfa
name treatmenta treatmentb
0 John Smith NaN 2
1 Jane Doe 16.0 11
2 Mary Johnson 3.0 1
dfa.melt(id_vars=['name'],var_name='treatment',value_name='result')
name treatment result
0 John Smith treatmenta NaN
1 Jane Doe treatmenta 16.0
2 Mary Johnson treatmenta 3.0
3 John Smith treatmentb 2.0
4 Jane Doe treatmentb 11.0
5 Mary Johnson treatmentb 1.0
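melt has an inverse: pivot spreads a tidy frame back out to the wide layout. A quick sketch (tidy_a is a name introduced here, not from the notes above):

tidy_a = dfa.melt(id_vars=['name'],var_name='treatment',value_name='result')
# pivot is the inverse of melt: one row per name, one column per treatment
tidy_a.pivot(index='name',columns='treatment',values='result')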
arabica_data_url = 'https://raw.githubusercontent.com/jldbc/coffee-quality-database/master/data/arabica_data_cleaned.csv'
# load the data
coffee_df = pd.read_csv(arabica_data_url)
# get total bags per country
bags_per_country = coffee_df.groupby('Country.of.Origin')['Number.of.Bags'].sum()

# sort descending, keep only the top 10 and pick out only the country names
top_bags_country_list = bags_per_country.sort_values(ascending=False)[:10].index

# filter the original data for only the countries in the top list
top_coffee_df = coffee_df[coffee_df['Country.of.Origin'].isin(top_bags_country_list)]
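An equivalent way to pick the top 10 (an alternative sketch, not required) is Series.nlargest, which sorts and slices in one step:

# same countries as sort_values(ascending=False)[:10]
top_bags_country_list = bags_per_country.nlargest(10).index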
bags_per_country
Country.of.Origin
Brazil                          30534
Burundi                           520
China                              55
Colombia                        41204
Costa Rica                      10354
Cote d?Ivoire                       2
Ecuador                             1
El Salvador                      4449
Ethiopia                        11761
Guatemala                       36868
Haiti                             390
Honduras                        13167
India                              20
Indonesia                        1658
Japan                              20
Kenya                            3971
Laos                               81
Malawi                            557
Mauritius                           1
Mexico                          24140
Myanmar                            10
Nicaragua                        6406
Panama                            537
Papua New Guinea                    7
Peru                             2336
Philippines                       259
Rwanda                            150
Taiwan                           1914
Tanzania, United Republic Of     3760
Thailand                         1310
Uganda                           3868
United States                     361
United States (Hawaii)            833
United States (Puerto Rico)        71
Vietnam                            10
Zambia                             13
Name: Number.of.Bags, dtype: int64
top_bags_country_list
Index(['Colombia', 'Guatemala', 'Brazil', 'Mexico', 'Honduras', 'Ethiopia',
       'Costa Rica', 'Nicaragua', 'El Salvador', 'Kenya'],
      dtype='object', name='Country.of.Origin')
top_coffee_df.head(1)
Unnamed: 0 Species Owner Country.of.Origin Farm.Name Lot.Number Mill ICO.Number Company Altitude ... Color Category.Two.Defects Expiration Certification.Body Certification.Address Certification.Contact unit_of_measurement altitude_low_meters altitude_high_meters altitude_mean_meters
0 1 Arabica metad plc Ethiopia metad plc NaN metad plc 2014/2015 metad agricultural developmet plc 1950-2200 ... Green 0 April 3rd, 2016 METAD Agricultural Development plc 309fcf77415a3661ae83e027f7e5f05dad786e44 19fef5a731de2db57d16da10287413f5f99bc2dd m 1950.0 2200.0 2075.0

1 rows × 44 columns

coffee_df.head(1)
Unnamed: 0 Species Owner Country.of.Origin Farm.Name Lot.Number Mill ICO.Number Company Altitude ... Color Category.Two.Defects Expiration Certification.Body Certification.Address Certification.Contact unit_of_measurement altitude_low_meters altitude_high_meters altitude_mean_meters
0 1 Arabica metad plc Ethiopia metad plc NaN metad plc 2014/2015 metad agricultural developmet plc 1950-2200 ... Green 0 April 3rd, 2016 METAD Agricultural Development plc 309fcf77415a3661ae83e027f7e5f05dad786e44 19fef5a731de2db57d16da10287413f5f99bc2dd m 1950.0 2200.0 2075.0

1 rows × 44 columns

coffee_df.shape,top_coffee_df.shape
((1311, 44), (952, 44))
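As a quick sanity check (an added check, not from the original notes), the filtered frame should contain exactly the 10 countries in the list:

# should be 10 if the isin filter worked as intended
top_coffee_df['Country.of.Origin'].nunique()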
top_coffee_df.describe()
Unnamed: 0 Number.of.Bags Aroma Flavor Aftertaste Acidity Body Balance Uniformity Clean.Cup Sweetness Cupper.Points Total.Cup.Points Moisture Category.One.Defects Quakers Category.Two.Defects altitude_low_meters altitude_high_meters altitude_mean_meters
count 952.000000 952.000000 952.000000 952.000000 952.000000 952.000000 952.000000 952.000000 952.000000 952.000000 952.000000 952.000000 952.000000 952.000000 952.000000 951.000000 952.000000 830.000000 830.000000 830.000000
mean 653.811975 192.073529 7.557468 7.513330 7.379338 7.533172 7.505662 7.513214 9.839296 9.825557 9.912384 7.483057 82.062626 0.091292 0.380252 0.214511 4.011555 1918.387596 1966.069061 1942.228329
std 378.427772 120.682457 0.400004 0.418425 0.430553 0.403558 0.383316 0.434140 0.584349 0.834365 0.548040 0.469682 3.839237 0.045773 1.658661 0.907420 5.493412 10009.962014 10009.287083 10009.183888
min 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000
25% 323.750000 50.000000 7.420000 7.330000 7.170000 7.330000 7.330000 7.330000 10.000000 10.000000 10.000000 7.250000 81.170000 0.100000 0.000000 0.000000 1.000000 1150.000000 1200.000000 1200.000000
50% 659.500000 250.000000 7.580000 7.580000 7.420000 7.500000 7.500000 7.500000 10.000000 10.000000 10.000000 7.500000 82.500000 0.110000 0.000000 0.000000 2.000000 1310.640000 1350.000000 1344.000000
75% 972.500000 275.000000 7.750000 7.750000 7.580000 7.750000 7.670000 7.750000 10.000000 10.000000 10.000000 7.750000 83.670000 0.120000 0.000000 0.000000 5.000000 1600.000000 1650.000000 1600.000000
max 1312.000000 1062.000000 8.750000 8.830000 8.670000 8.750000 8.580000 8.750000 10.000000 10.000000 10.000000 9.250000 90.580000 0.220000 31.000000 11.000000 55.000000 190164.000000 190164.000000 190164.000000
top_coffee_df.columns
Index(['Unnamed: 0', 'Species', 'Owner', 'Country.of.Origin', 'Farm.Name',
       'Lot.Number', 'Mill', 'ICO.Number', 'Company', 'Altitude', 'Region',
       'Producer', 'Number.of.Bags', 'Bag.Weight', 'In.Country.Partner',
       'Harvest.Year', 'Grading.Date', 'Owner.1', 'Variety',
       'Processing.Method', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body',
       'Balance', 'Uniformity', 'Clean.Cup', 'Sweetness', 'Cupper.Points',
       'Total.Cup.Points', 'Moisture', 'Category.One.Defects', 'Quakers',
       'Color', 'Category.Two.Defects', 'Expiration', 'Certification.Body',
       'Certification.Address', 'Certification.Contact', 'unit_of_measurement',
       'altitude_low_meters', 'altitude_high_meters', 'altitude_mean_meters'],
      dtype='object')
ratings_of_interest = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance']
coffee_scores_df = top_coffee_df.melt(id_vars='Country.of.Origin',value_vars=ratings_of_interest,
                   var_name='rating',value_name='score')
coffee_scores_df.head(1)
Country.of.Origin rating score
0 Ethiopia Aroma 8.67
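Each of the 952 rows of top_coffee_df contributes one melted row per rating, so the result should have 952 * 6 = 5712 rows and 3 columns; a quick check:

# one row per (coffee, rating) pair
coffee_scores_df.shape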
top_coffee_df.melt(id_vars='Country.of.Origin')['variable'].unique()
array(['Unnamed: 0', 'Species', 'Owner', 'Farm.Name', 'Lot.Number',
       'Mill', 'ICO.Number', 'Company', 'Altitude', 'Region', 'Producer',
       'Number.of.Bags', 'Bag.Weight', 'In.Country.Partner',
       'Harvest.Year', 'Grading.Date', 'Owner.1', 'Variety',
       'Processing.Method', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity',
       'Body', 'Balance', 'Uniformity', 'Clean.Cup', 'Sweetness',
       'Cupper.Points', 'Total.Cup.Points', 'Moisture',
       'Category.One.Defects', 'Quakers', 'Color', 'Category.Two.Defects',
       'Expiration', 'Certification.Body', 'Certification.Address',
       'Certification.Contact', 'unit_of_measurement',
       'altitude_low_meters', 'altitude_high_meters',
       'altitude_mean_meters'], dtype=object)
top_coffee_df.melt(id_vars='Country.of.Origin',value_vars=ratings_of_interest,)['variable'].unique()
array(['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance'],
      dtype=object)
%matplotlib inline
sns.displot(data=coffee_scores_df, x='score',col='Country.of.Origin',
           hue = 'rating',col_wrap=5,kind='kde')
<seaborn.axisgrid.FacetGrid at 0x7f78eeaa40d0>
[figure: KDE curves of score, colored by rating, one panel per country, 5 panels per row]
sns.displot(data=coffee_scores_df, x='score',hue='Country.of.Origin',
           col = 'rating',col_wrap=3,kind='kde')
<seaborn.axisgrid.FacetGrid at 0x7f7928960d30>
[figure: KDE curves of score, colored by country, one panel per rating, 3 panels per row]
top_coffee_df.columns
Index(['Unnamed: 0', 'Species', 'Owner', 'Country.of.Origin', 'Farm.Name',
       'Lot.Number', 'Mill', 'ICO.Number', 'Company', 'Altitude', 'Region',
       'Producer', 'Number.of.Bags', 'Bag.Weight', 'In.Country.Partner',
       'Harvest.Year', 'Grading.Date', 'Owner.1', 'Variety',
       'Processing.Method', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body',
       'Balance', 'Uniformity', 'Clean.Cup', 'Sweetness', 'Cupper.Points',
       'Total.Cup.Points', 'Moisture', 'Category.One.Defects', 'Quakers',
       'Color', 'Category.Two.Defects', 'Expiration', 'Certification.Body',
       'Certification.Address', 'Certification.Contact', 'unit_of_measurement',
       'altitude_low_meters', 'altitude_high_meters', 'altitude_mean_meters'],
      dtype='object')
coffee_scores_df2 = top_coffee_df.melt(id_vars=['Country.of.Origin','Color'],value_vars=ratings_of_interest,
                   var_name='rating',value_name='score')
coffee_scores_df2.head(1)
Country.of.Origin Color rating score
0 Ethiopia Green Aroma 8.67
sns.displot(data=coffee_scores_df2, x='score',hue='Country.of.Origin',
           col = 'rating',row='Color',kind='kde')
/tmp/ipykernel_1960/3482930274.py:1: UserWarning: Dataset has 0 variance; skipping density estimate. Pass `warn_singular=False` to disable this warning.
  sns.displot(data=coffee_scores_df2, x='score',hue='Country.of.Origin',
<seaborn.axisgrid.FacetGrid at 0x7f78edd448b0>
[figure: KDE curves of score, colored by country, in a grid with one column per rating and one row per bean Color]
coffee_df.describe()
Unnamed: 0 Number.of.Bags Aroma Flavor Aftertaste Acidity Body Balance Uniformity Clean.Cup Sweetness Cupper.Points Total.Cup.Points Moisture Category.One.Defects Quakers Category.Two.Defects altitude_low_meters altitude_high_meters altitude_mean_meters
count 1311.000000 1311.000000 1311.000000 1311.000000 1311.000000 1311.000000 1311.000000 1311.000000 1311.000000 1311.00000 1311.000000 1311.000000 1311.000000 1311.000000 1311.000000 1310.000000 1311.000000 1084.000000 1084.000000 1084.000000
mean 656.000763 153.887872 7.563806 7.518070 7.397696 7.533112 7.517727 7.517506 9.833394 9.83312 9.903272 7.497864 82.115927 0.088863 0.426392 0.177099 3.591915 1759.548954 1808.843803 1784.196379
std 378.598733 129.733734 0.378666 0.399979 0.405119 0.381599 0.359213 0.406316 0.559343 0.77135 0.530832 0.474610 3.515761 0.047957 1.832415 0.840583 5.350371 8767.847252 8767.187498 8767.016913
min 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000
25% 328.500000 14.500000 7.420000 7.330000 7.250000 7.330000 7.330000 7.330000 10.000000 10.00000 10.000000 7.250000 81.170000 0.090000 0.000000 0.000000 0.000000 1100.000000 1100.000000 1100.000000
50% 656.000000 175.000000 7.580000 7.580000 7.420000 7.500000 7.500000 7.500000 10.000000 10.00000 10.000000 7.500000 82.500000 0.110000 0.000000 0.000000 2.000000 1310.640000 1350.000000 1310.640000
75% 983.500000 275.000000 7.750000 7.750000 7.580000 7.750000 7.670000 7.750000 10.000000 10.00000 10.000000 7.750000 83.670000 0.120000 0.000000 0.000000 4.000000 1600.000000 1650.000000 1600.000000
max 1312.000000 1062.000000 8.750000 8.830000 8.670000 8.750000 8.580000 8.750000 10.000000 10.00000 10.000000 10.000000 90.580000 0.280000 31.000000 11.000000 55.000000 190164.000000 190164.000000 190164.000000

7.1. More manipulations#

Here, we will make a tiny DataFrame from scratch to illustrate a couple of points.

large_num_df = pd.DataFrame(data= [[730000000,392000000,580200000],
                                   [315040009,580000000,967290000]],
                           columns = ['a','b','c'])
large_num_df
a b c
0 730000000 392000000 580200000
1 315040009 580000000 967290000

This dataset is not tidy, but entering it this way was faster. We could make it tidy using melt as is.

large_num_df.melt()
variable value
0 a 730000000
1 a 315040009
2 b 392000000
3 b 580000000
4 c 580200000
5 c 967290000

However, I want an additional variable, so I will reset the index. This turns the original index into a column and adds a new numerical index; in this case the two are the same.

large_num_df.reset_index()
index a b c
0 0 730000000 392000000 580200000
1 1 315040009 580000000 967290000

If I melt this one, using the index as the id, then I get a reasonably tidy DataFrame.

ls_tall_df = large_num_df.reset_index().melt(id_vars='index')
ls_tall_df
index variable value
0 0 a 730000000
1 1 a 315040009
2 0 b 392000000
3 1 b 580000000
4 0 c 580200000
5 1 c 967290000

Now, we can plot.

sns.catplot(data = ls_tall_df,x='variable',y='value',
            hue='index',kind='bar')
<seaborn.axisgrid.FacetGrid at 0x7f78e8438be0>
[figure: grouped bar chart of value for each variable, colored by index]

Since the numbers are so big, this might be hard to interpret, and displaying them with all of the zeros would not be any easier to read. The best thing to do is to add a new column with adjusted values and a name that states the units.

ls_tall_df['value (millions)'] = ls_tall_df['value']/1000000
ls_tall_df.head()
index variable value value (millions)
0 0 a 730000000 730.000000
1 1 a 315040009 315.040009
2 0 b 392000000 392.000000
3 1 b 580000000 580.000000
4 0 c 580200000 580.200000

Now we can plot again, with the smaller values and an updated axis label. Adding a new column for the adjusted values, instead of overwriting the original, is good practice: no data is lost, and because the values and the column name are set together, it stays clear what the numbers represent.

sns.catplot(data = ls_tall_df,x='variable',y='value (millions)',
            hue='index',kind='bar')
<seaborn.axisgrid.FacetGrid at 0x7f78e8e8c970>
[figure: the same grouped bar chart with the y axis in millions]
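An alternative to adding a rescaled column (a related sketch, not what these notes do) is to rescale only the tick labels with a matplotlib formatter, leaving the data unchanged:

from matplotlib.ticker import FuncFormatter

g = sns.catplot(data = ls_tall_df,x='variable',y='value',
            hue='index',kind='bar')
# format the raw values as millions on the axis; g.ax assumes a single-panel grid
g.ax.yaxis.set_major_formatter(FuncFormatter(lambda v,_: f'{v/1e6:.0f}M'))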