mlearning lab

Diabetes dataset Overview

Rows

10,000

Columns

21

Memory (MB)

4.11

Show code

# Overview metrics: frame dimensions plus total memory footprint.
rows, cols = df.shape
mem_bytes = int(df.memory_usage(deep=True).sum())
# Bytes -> MiB (1024 ** 2 bytes), rounded to two decimals for display.
mem_mb = round(mem_bytes / 1024 ** 2, 2)

Integrity Notes

ID-like columns: Unnamed: 0

Show all flagged columns (1)

Column	Flags	Type	Unique Values	Unique Ratio	Missing (%)
Unnamed: 0	id-like	int64	10000	1.0	0.0

Show code

# Build the Integrity Notes table: one row per column that carries a flag.
# Relies on names computed outside this snippet: id_like / hi_card / consts
# (used here only for `in` membership tests) and the per-column lookups
# dtypes, uniques, uniq_ratio and miss_pct.
flag_columns = [
    'Column',
    'Flags',
    'Type',
    'Unique Values',
    'Unique Ratio',
    'Missing (%)',
]
flag_rows = []
for c in df.columns:
    f = []
    if c in id_like:
        f.append('id-like')
    if c in hi_card:
        f.append('high-card')
    if c in consts:
        f.append('constant')
    if f:
        flag_rows.append({
            'Column': c,
            'Flags': ', '.join(f),
            'Type': dtypes[c],
            'Unique Values': int(uniques.get(c, 0)),
            'Unique Ratio': float(uniq_ratio.get(c, 0)),
            'Missing (%)': float(miss_pct.get(c, 0)),
        })
# Passing columns= keeps the table's schema even when no column was flagged;
# without it an empty flag_rows list produces a frame with no columns at all.
flagged_df = pd.DataFrame(flag_rows, columns=flag_columns)
flagged_df

Data Preview

Unnamed: 0	Age	Sex	Ethnicity	BMI	Waist_Circumference	Fasting_Blood_Glucose	HbA1c	Blood_Pressure_Systolic	Blood_Pressure_Diastolic	Cholesterol_Total	Cholesterol_HDL	Cholesterol_LDL	GGT	Serum_Urate	Physical_Activity_Level	Dietary_Intake_Calories	Alcohol_Consumption	Smoking_Status	Family_History_of_Diabetes	Previous_Gestational_Diabetes
0	58	Female	White	35.8	83.4	123.9	10.9	152	114	197.8	50.2	99.2	37.5	7.2	Moderate	1538	Moderate	Never	0	1
1	48	Male	Asian	24.1	71.4	183.7	12.8	103	91	261.6	62.0	146.4	88.5	6.1	Moderate	2653	Moderate	Current	0	1
2	34	Female	Black	25.0	113.8	142.0	14.5	179	104	261.0	32.1	164.1	56.2	6.9	Low	1684	Heavy	Former	1	0
3	62	Male	Asian	32.7	100.4	167.4	8.8	176	118	183.4	41.1	84.0	34.4	5.4	Low	3796	Moderate	Never	1	0
4	27	Female	Asian	33.5	110.8	146.4	7.1	122	97	203.2	53.9	92.8	81.9	7.4	Moderate	3161	Heavy	Current	0	0
5	40	Female	Asian	33.6	96.1	75.0	13.5	170	90	152.3	44.5	190.0	77.5	6.4	Low	3460	NaN	Never	1	1
6	58	Male	Black	33.2	100.0	97.7	13.3	131	80	199.8	77.9	73.4	52.1	4.7	High	3107	Moderate	Never	0	0
7	38	Female	Hispanic	26.9	105.0	80.2	10.9	121	83	154.0	69.7	122.2	72.0	5.6	Moderate	2390	Heavy	Current	0	1

Show code

# Data preview: first eight rows of the listed columns, in the listed order.
df.loc[:, [
    'Unnamed: 0',
    'Age',
    'Sex',
    'Ethnicity',
    'BMI',
    'Waist_Circumference',
    'Fasting_Blood_Glucose',
    'HbA1c',
    'Blood_Pressure_Systolic',
    'Blood_Pressure_Diastolic',
    'Cholesterol_Total',
    'Cholesterol_HDL',
    'Cholesterol_LDL',
    'GGT',
    'Serum_Urate',
    'Physical_Activity_Level',
    'Dietary_Intake_Calories',
    'Alcohol_Consumption',
    'Smoking_Status',
    'Family_History_of_Diabetes',
    'Previous_Gestational_Diabetes',
]].head(8)

Summary Statistics

Statistic	Unnamed: 0	Age	BMI	Waist_Circumference	Fasting_Blood_Glucose	HbA1c	Blood_Pressure_Systolic	Blood_Pressure_Diastolic	Cholesterol_Total	Cholesterol_HDL	Cholesterol_LDL	GGT	Serum_Urate	Dietary_Intake_Calories	Family_History_of_Diabetes	Previous_Gestational_Diabetes
count	10000.00000	10000.000000	10000.000000	10000.00000	10000.000000	10000.000000	10000.000000	10000.000000	10000.000000	10000.000000	10000.00000	10000.00000	10000.000000	10000.000000	10000.000000	10000.000000
mean	4999.50000	44.620400	29.418150	94.79707	134.776210	9.507510	134.163700	89.558700	225.165540	55.019340	134.35377	55.16822	5.503430	2742.481900	0.507000	0.516500
std	2886.89568	14.343489	6.170866	14.38329	37.633544	3.176421	26.110317	17.237792	42.963744	14.537371	37.50238	25.88180	1.455091	716.643803	0.499976	0.499753
min	0.00000	20.000000	18.500000	70.00000	70.000000	4.000000	90.000000	60.000000	150.000000	30.000000	70.00000	10.00000	3.000000	1500.000000	0.000000	0.000000
25%	2499.75000	32.000000	24.100000	82.20000	102.175000	6.800000	112.000000	75.000000	187.875000	42.300000	101.67500	32.60000	4.200000	2129.000000	0.000000	0.000000
50%	4999.50000	45.000000	29.500000	94.90000	134.500000	9.500000	134.000000	89.000000	225.500000	55.200000	134.40000	55.45000	5.500000	2727.000000	1.000000	1.000000
75%	7499.25000	57.000000	34.700000	107.00000	167.800000	12.300000	157.000000	105.000000	262.400000	67.900000	166.40000	77.50000	6.800000	3368.000000	1.000000	1.000000
max	9999.00000	69.000000	40.000000	120.00000	200.000000	15.000000	179.000000	119.000000	300.000000	80.000000	200.00000	100.00000	8.000000	3999.000000	1.000000	1.000000

Show code

# Summary statistics for the listed columns. With describe()'s defaults a
# mixed-dtype frame is summarised over its numeric columns only, so the
# object columns named here do not appear in the result.
df.loc[:, [
    'Unnamed: 0',
    'Age',
    'Sex',
    'Ethnicity',
    'BMI',
    'Waist_Circumference',
    'Fasting_Blood_Glucose',
    'HbA1c',
    'Blood_Pressure_Systolic',
    'Blood_Pressure_Diastolic',
    'Cholesterol_Total',
    'Cholesterol_HDL',
    'Cholesterol_LDL',
    'GGT',
    'Serum_Urate',
    'Physical_Activity_Level',
    'Dietary_Intake_Calories',
    'Alcohol_Consumption',
    'Smoking_Status',
    'Family_History_of_Diabetes',
    'Previous_Gestational_Diabetes',
]].describe()

Column Profile

Column	Type	Non-Null Count	Missing (%)	Unique Values	Sample Values	Flags
Unnamed: 0	int64	10000	0.0	10000	0, 1, 2	id-like
Age	int64	10000	0.0	50	58, 48, 34
Sex	object	10000	0.0	2	Female, Male
Ethnicity	object	10000	0.0	4	White, Asian, Black
BMI	float64	10000	0.0	216	35.8, 24.1, 25.0
Waist_Circumference	float64	10000	0.0	501	83.4, 71.4, 113.8
Fasting_Blood_Glucose	float64	10000	0.0	1301	123.9, 183.7, 142.0
HbA1c	float64	10000	0.0	111	10.9, 12.8, 14.5
Blood_Pressure_Systolic	int64	10000	0.0	90	152, 103, 179
Blood_Pressure_Diastolic	int64	10000	0.0	60	114, 91, 104
Cholesterol_Total	float64	10000	0.0	1499	197.8, 261.6, 261.0
Cholesterol_HDL	float64	10000	0.0	501	50.2, 62.0, 32.1
Cholesterol_LDL	float64	10000	0.0	1301	99.2, 146.4, 164.1
GGT	float64	10000	0.0	901	37.5, 88.5, 56.2
Serum_Urate	float64	10000	0.0	51	7.2, 6.1, 6.9
Physical_Activity_Level	object	10000	0.0	3	Moderate, Low, High
Dietary_Intake_Calories	int64	10000	0.0	2451	1538, 2653, 1684
Alcohol_Consumption	object	6680	33.2	2	Moderate, Heavy
Smoking_Status	object	10000	0.0	3	Never, Current, Former
Family_History_of_Diabetes	int64	10000	0.0	2	0, 1
Previous_Gestational_Diabetes	int64	10000	0.0	2	1, 0

Show code

# Per-column inputs for the Column Profile table.
dtypes = df.dtypes.astype(str)
nonnull = df.notna().sum()
miss_pct = (df.isna().mean() * 100).round(1)
uniques = df.nunique(dropna=True)
# Use at least 1 as the denominator so an empty frame cannot divide by zero.
n = max(len(df), 1)
uniq_ratio = uniques.div(n).fillna(0.0)
def _sample_vals(s, k=3):
    """Return up to ``k`` distinct non-null values of ``s`` as one string.

    Values are converted to text, kept in order of first appearance and
    joined with ``', '``. An empty string is returned when ``s`` has no
    non-null values.
    """
    as_text = s.dropna().astype(str)
    first_distinct = pd.unique(as_text)[:k]
    if len(first_distinct) == 0:
        return ''
    return ', '.join(str(value) for value in first_distinct)
def _column_flags(col):
    """Return the comma-separated integrity flags for column ``col``.

    Reads the module-level ``uniques``, ``uniq_ratio`` and ``dtypes`` lookups.
    """
    n_unique = uniques.get(col, 0)
    ratio = uniq_ratio.get(col, 0)
    dtype_name = dtypes[col]
    found = []
    # At most one distinct non-null value.
    if n_unique <= 1:
        found.append('constant')
    # Nearly every row is distinct; datetime columns are exempt.
    if ratio >= 0.95 and 'datetime' not in dtype_name.lower():
        found.append('id-like')
    # Object column where more than half the rows are distinct, unless it
    # was already flagged as id-like.
    if dtype_name.startswith('object') and ratio > 0.5 and 'id-like' not in found:
        found.append('high-card')
    return ', '.join(found)


flags_col = [_column_flags(name) for name in df.columns]
profile_df = pd.DataFrame({
    'Column': df.columns,
    'Type': dtypes.values,
    'Non-Null Count': nonnull.values,
    'Missing (%)': miss_pct.values,
    'Unique Values': uniques.values,
    'Sample Values': [_sample_vals(df[name]) for name in df.columns],
    'Flags': flags_col,
})
profile_df

Column Types

Column	Type	Non-Null Count	Unique Values
Unnamed: 0	int64	10000	10000
Age	int64	10000	50
Sex	object	10000	2
Ethnicity	object	10000	4
BMI	float64	10000	216
Waist_Circumference	float64	10000	501
Fasting_Blood_Glucose	float64	10000	1301
HbA1c	float64	10000	111
Blood_Pressure_Systolic	int64	10000	90
Blood_Pressure_Diastolic	int64	10000	60
Cholesterol_Total	float64	10000	1499
Cholesterol_HDL	float64	10000	501
Cholesterol_LDL	float64	10000	1301
GGT	float64	10000	901
Serum_Urate	float64	10000	51
Physical_Activity_Level	object	10000	3
Dietary_Intake_Calories	int64	10000	2451
Alcohol_Consumption	object	6680	2
Smoking_Status	object	10000	3
Family_History_of_Diabetes	int64	10000	2
Previous_Gestational_Diabetes	int64	10000	2

Show code

# Column Types table: dtype name, non-null count and distinct-value count
# for each column of df.
pd.DataFrame(
    {
        'Column': df.columns,
        'Type': df.dtypes.astype(str).values,
        'Non-Null Count': df.notna().sum().values,
        'Unique Values': df.nunique().values,
    }
)

Outliers — Top 3 records

Row index: 5301 · score: 1.42

HbA1c

1.42

Waist_Circumference

1.23

BMI

1.22

Serum_Urate

0.99

Cholesterol_Total

0.92

Unnamed: 0	Age	Sex	Ethnicity	BMI	Waist_Circumference	Fasting_Blood_Glucose	HbA1c	Blood_Pressure_Systolic	Blood_Pressure_Diastolic	Cholesterol_Total	Cholesterol_HDL	Cholesterol_LDL	GGT	Serum_Urate	Physical_Activity_Level	Dietary_Intake_Calories	Alcohol_Consumption	Smoking_Status	Family_History_of_Diabetes	Previous_Gestational_Diabetes
5301	57	Male	White	39.3	117.6	110.5	4.0	149	104	278.4	71.9	157.5	34.4	3.6	Moderate	2086	Moderate	Former	0	1

Row index: 1536 · score: 1.42

HbA1c

1.42

Blood_Pressure_Systolic

1.23

Age

1.01

GGT

0.99

Unnamed: 0

0.94

Unnamed: 0	Age	Sex	Ethnicity	BMI	Waist_Circumference	Fasting_Blood_Glucose	HbA1c	Blood_Pressure_Systolic	Blood_Pressure_Diastolic	Cholesterol_Total	Cholesterol_HDL	Cholesterol_LDL	GGT	Serum_Urate	Physical_Activity_Level	Dietary_Intake_Calories	Alcohol_Consumption	Smoking_Status	Family_History_of_Diabetes	Previous_Gestational_Diabetes
1536	27	Female	Hispanic	27.7	102.7	136.1	4.0	174	71	270.2	48.8	144.8	88.3	6.6	High	3388	Moderate	Former	0	1

Row index: 3100 · score: 1.42

HbA1c

1.42

Fasting_Blood_Glucose

1.30

Dietary_Intake_Calories

1.30

Blood_Pressure_Diastolic

1.26

Serum_Urate

1.09

Unnamed: 0	Age	Sex	Ethnicity	BMI	Waist_Circumference	Fasting_Blood_Glucose	HbA1c	Blood_Pressure_Systolic	Blood_Pressure_Diastolic	Cholesterol_Total	Cholesterol_HDL	Cholesterol_LDL	GGT	Serum_Urate	Physical_Activity_Level	Dietary_Intake_Calories	Alcohol_Consumption	Smoking_Status	Family_History_of_Diabetes	Previous_Gestational_Diabetes
3100	44	Male	Black	25.9	85.2	197.8	4.0	143	117	250.6	70.4	148.9	50.0	7.6	Low	1539	Moderate	Never	1	1

Show code

# Score each row by its largest absolute robust z-score (median / MAD based)
# across the first max_cols numeric columns, then keep the three highest.
num_cols = list(df.select_dtypes(include='number').columns[:max_cols])
df_num = df[num_cols]
if len(df_num) > max_rows:
    # Cap the work at max_rows rows; the fixed seed makes the sample repeatable.
    df_num = df_num.sample(max_rows, random_state=0)
med = df_num.median()
mad = (df_num - med).abs().median()
# A MAD of zero is replaced by NaN so those columns yield NaN instead of inf.
rz = (df_num - med).mul(0.6745).div(mad.replace(0, np.nan))
row_score = rz.abs().max(axis=1)
top3 = row_score.nlargest(3)

Outliers — Violin + Box (Top 3 numerics by IQR outliers)

Show code

# Numeric columns only, limited to the first max_cols columns.
dfn = df.select_dtypes(include='number')
dfn = dfn.iloc[:, :max_cols]
if len(dfn) > max_rows:
    # Down-sample to max_rows rows with a fixed seed.
    dfn = dfn.sample(max_rows, random_state=0)
# rank columns by Tukey outliers (1.5*IQR) and plot violins with inner box

Missing Values

Column	Missing Values	Missing (%)
Alcohol_Consumption	3320	33.2

Show code

# Missing Values table: absolute and percentage null counts per column.
nulls = df.isnull().sum()
nulls_pct = (df.isnull().mean() * 100).round(1)
missing_df = pd.DataFrame({
    'Column': df.columns,
    'Missing Values': nulls.values,
    'Missing (%)': nulls_pct.values
})
# Show only the columns that have at least one missing value.
missing_df[missing_df['Missing Values'] > 0]

Missingness (Top 20)

Show code

# Missingness ranking: the (up to) 20 columns with the highest share of nulls.
nulls = df.isna().sum()
nulls_pct = (df.isna().mean() * 100).round(1)
missing_df = pd.DataFrame({
    'Column': df.columns,
    'Missing Values': nulls.values,
    'Missing (%)': nulls_pct.values,
})

# Keep columns with at least one null, order by percentage (highest first),
# then trim to the two display columns and renumber the rows from zero.
top_miss = missing_df[missing_df['Missing Values'] > 0]
top_miss = top_miss.sort_values('Missing (%)', ascending=False)
top_miss = top_miss[['Column', 'Missing (%)']].head(20).reset_index(drop=True)
top_miss

Category Distribution — (Ethnicity)

Show code

# NOTE(review): this value looks like a template placeholder for the column
# being plotted (the section title names Ethnicity) - confirm before running.
dist_col = '<chosen categorical>'
# Cast to object and label nulls as 'Missing' so they are counted as a category.
s = df[dist_col].astype('object').mask(df[dist_col].isna(), other='Missing')
vc = s.value_counts(dropna=False)
top_k = 8  # Top-8 + Other (+ Missing)

ML Lab

Explore Data

Diabetes dataset Overview

Rows

Columns

Memory (MB)

Integrity Notes

Data Preview

Summary Statistics

Column Profile

Column Types

Outliers — Top 3 records

Outliers — Violin + Box (Top 3 numerics by IQR outliers)

Missing Values

Missingness (Top 20)

Category Distribution — (Ethnicity)