Skimpy: A light weight tool for creating summary statistics from dataframes

python

Published

December 30, 2022

from skimpy import skim

from skimpy import skim, generate_test_data

df = generate_test_data()

df.head()

	length	width	depth	rnd	class	location	booly_col	text	date	date_no_freq
0	0.762796	1.468082	9	-0.423534	virtginica	UK	False	What weather!	2018-01-31	NaT
1	0.031203	0.267769	10	2.102890	virtginica	UK	False	How are you?	2018-02-28	1992-01-05
2	0.044075	3.571043	12	0.147606	setosa	UK	True	How are you?	2018-03-31	2022-01-01
3	0.914088	2.838664	15	-0.997567	virtginica	NaN	True	<NA>	2018-04-30	NaT
4	0.555878	2.214629	5	0.329828	setosa	UK	False	How are you?	2018-05-31	2022-01-01

df.columns

Index(['length', 'width', 'depth', 'rnd', 'class', 'location', 'booly_col',
       'text', 'date', 'date_no_freq'],
      dtype='object')

df.shape

(1000, 10)

Generate summary statistics with skim

skim(df)

╭──────────────────────────────────────────────── skimpy summary ─────────────────────────────────────────────────╮
│          Data Summary                Data Types               Categories                                        │
│ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓ ┏━━━━━━━━━━━━━┳━━━━━━━┓ ┏━━━━━━━━━━━━━━━━━━━━━━━┓                                │
│ ┃ dataframe         ┃ Values ┃ ┃ Column Type ┃ Count ┃ ┃ Categorical Variables ┃                                │
│ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩ ┡━━━━━━━━━━━━━━━━━━━━━━━┩                                │
│ │ Number of rows    │ 1000   │ │ float64     │ 3     │ │ class                 │                                │
│ │ Number of columns │ 10     │ │ category    │ 2     │ │ location              │                                │
│ └───────────────────┴────────┘ │ datetime64  │ 2     │ └───────────────────────┘                                │
│                                │ int64       │ 1     │                                                          │
│                                │ bool        │ 1     │                                                          │
│                                │ string      │ 1     │                                                          │
│                                └─────────────┴───────┘                                                          │
│                                                     number                                                      │
│ ┏━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┓  │
│ ┃ column_name      ┃ NA    ┃ NA %    ┃ mean     ┃ sd     ┃ p0         ┃ p25      ┃ p75    ┃ p100   ┃ hist    ┃  │
│ ┡━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━┩  │
│ │ length           │     0 │       0 │      0.5 │   0.36 │    1.6e-06 │     0.13 │   0.86 │      1 │ █▃▃▃▄█  │  │
│ │ width            │     0 │       0 │        2 │    1.9 │     0.0021 │      0.6 │      3 │     14 │   █▃▁   │  │
│ │ depth            │     0 │       0 │       10 │    3.2 │          2 │        8 │     12 │     20 │ ▁▄█▆▃▁  │  │
│ │ rnd              │   120 │      12 │    -0.02 │      1 │       -2.8 │    -0.74 │   0.66 │    3.7 │  ▁▄█▅▁  │  │
│ └──────────────────┴───────┴─────────┴──────────┴────────┴────────────┴──────────┴────────┴────────┴─────────┘  │
│                                                    category                                                     │
│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓  │
│ ┃ column_name                      ┃ NA        ┃ NA %           ┃ ordered               ┃ unique             ┃  │
│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩  │
│ │ class                            │         0 │              0 │ False                 │                  2 │  │
│ │ location                         │         1 │            0.1 │ False                 │                  5 │  │
│ └──────────────────────────────────┴───────────┴────────────────┴───────────────────────┴────────────────────┘  │
│                                                    datetime                                                     │
│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓  │
│ ┃ column_name             ┃ NA    ┃ NA %      ┃ first               ┃ last                ┃ frequency        ┃  │
│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩  │
│ │ date                    │     0 │         0 │     2018-01-31      │     2101-04-30      │ M                │  │
│ │ date_no_freq            │     3 │       0.3 │     1992-01-05      │     2023-03-04      │ None             │  │
│ └─────────────────────────┴───────┴───────────┴─────────────────────┴─────────────────────┴──────────────────┘  │
│                                                     string                                                      │
│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓  │
│ ┃ column_name               ┃ NA      ┃ NA %       ┃ words per row                ┃ total words              ┃  │
│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩  │
│ │ text                      │       6 │        0.6 │                          5.8 │                     5800 │  │
│ └───────────────────────────┴─────────┴────────────┴──────────────────────────────┴──────────────────────────┘  │
│                                                      bool                                                       │
│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓  │
│ ┃ column_name                        ┃ true            ┃ true rate                     ┃ hist                ┃  │
│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩  │
│ │ booly_col                          │             520 │                          0.52 │       █    █        │  │
│ └────────────────────────────────────┴─────────────────┴───────────────────────────────┴─────────────────────┘  │
╰────────────────────────────────────────────────────── End ──────────────────────────────────────────────────────╯

Clean_name function

skimpy also comes with a clean_columns function as a convenience. This slugifies column names. For example,

import pandas as pd
from rich import print
from skimpy import clean_columns

columns = [
    "bs lncs;n edbn ",
    "Nín hǎo. Wǒ shì zhōng guó rén",
    "___This is a test___",
    "ÜBER Über German Umlaut",
]
messy_df = pd.DataFrame(columns=columns, index=[0], data=[range(len(columns))])
print("Column names:")
print(list(messy_df.columns))

Column names:

['bs lncs;n edbn ', 'Nín hǎo. Wǒ shì zhōng guó rén', '___This is a test___', 'ÜBER Über German Umlaut']

messy_df

	bs lncs;n edbn	Nín hǎo. Wǒ shì zhōng guó rén	___This is a test___	ÜBER Über German Umlaut
0	0	1	2	3

clean_df = clean_columns(messy_df)
print(list(clean_df.columns))

4 column names have been cleaned

['bs_lncs_n_edbn', 'nin_hao_wo_shi_zhong_guo_ren', 'this_is_a_test', 'uber_uber_german_umlaut']