# Basic size facts about the frame: dimensions plus deep memory footprint.
rows, cols = len(df.index), len(df.columns)
# deep=True also counts the payload of object (e.g. string) columns.
mem_bytes = int(df.memory_usage(deep=True).sum())
# Bytes -> mebibytes, rounded to two decimals for display.
mem_mb = round(mem_bytes / 1024 ** 2, 2)
| Column | Flags | Type | Unique Values | Unique Ratio | Missing (%) |
|---|---|---|---|---|---|
| Unnamed: 0 | id-like | int64 | 10000 | 1.0 | 0.0 |
# Build Integrity Notes lists and full flagged table.
# One output row per column that carries at least one flag; unflagged
# columns are omitted. Flag labels keep the order id-like, high-card, constant.
flag_sources = (
    ('id-like', id_like),
    ('high-card', hi_card),
    ('constant', consts),
)
flag_rows = []
for c in df.columns:
    f = [label for label, members in flag_sources if c in members]
    if f:
        flag_rows.append({
            'Column': c,
            'Flags': ', '.join(f),
            'Type': dtypes[c],
            'Unique Values': int(uniques.get(c, 0)),
            'Unique Ratio': float(uniq_ratio.get(c, 0)),
            'Missing (%)': float(miss_pct.get(c, 0)),
        })
# Passing `columns=` keeps the table schema stable even when no column is
# flagged (an empty list of dicts would otherwise give a column-less frame).
flagged_df = pd.DataFrame(
    flag_rows,
    columns=[
        'Column',
        'Flags',
        'Type',
        'Unique Values',
        'Unique Ratio',
        'Missing (%)',
    ],
)
flagged_df
| Unnamed: 0 | Age | Sex | Ethnicity | BMI | Waist_Circumference | Fasting_Blood_Glucose | HbA1c | Blood_Pressure_Systolic | Blood_Pressure_Diastolic | Cholesterol_Total | Cholesterol_HDL | Cholesterol_LDL | GGT | Serum_Urate | Physical_Activity_Level | Dietary_Intake_Calories | Alcohol_Consumption | Smoking_Status | Family_History_of_Diabetes | Previous_Gestational_Diabetes |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 58 | Female | White | 35.8 | 83.4 | 123.9 | 10.9 | 152 | 114 | 197.8 | 50.2 | 99.2 | 37.5 | 7.2 | Moderate | 1538 | Moderate | Never | 0 | 1 |
| 1 | 48 | Male | Asian | 24.1 | 71.4 | 183.7 | 12.8 | 103 | 91 | 261.6 | 62.0 | 146.4 | 88.5 | 6.1 | Moderate | 2653 | Moderate | Current | 0 | 1 |
| 2 | 34 | Female | Black | 25.0 | 113.8 | 142.0 | 14.5 | 179 | 104 | 261.0 | 32.1 | 164.1 | 56.2 | 6.9 | Low | 1684 | Heavy | Former | 1 | 0 |
| 3 | 62 | Male | Asian | 32.7 | 100.4 | 167.4 | 8.8 | 176 | 118 | 183.4 | 41.1 | 84.0 | 34.4 | 5.4 | Low | 3796 | Moderate | Never | 1 | 0 |
| 4 | 27 | Female | Asian | 33.5 | 110.8 | 146.4 | 7.1 | 122 | 97 | 203.2 | 53.9 | 92.8 | 81.9 | 7.4 | Moderate | 3161 | Heavy | Current | 0 | 0 |
| 5 | 40 | Female | Asian | 33.6 | 96.1 | 75.0 | 13.5 | 170 | 90 | 152.3 | 44.5 | 190.0 | 77.5 | 6.4 | Low | 3460 | NaN | Never | 1 | 1 |
| 6 | 58 | Male | Black | 33.2 | 100.0 | 97.7 | 13.3 | 131 | 80 | 199.8 | 77.9 | 73.4 | 52.1 | 4.7 | High | 3107 | Moderate | Never | 0 | 0 |
| 7 | 38 | Female | Hispanic | 26.9 | 105.0 | 80.2 | 10.9 | 121 | 83 | 154.0 | 69.7 | 122.2 | 72.0 | 5.6 | Moderate | 2390 | Heavy | Current | 0 | 1 |
# Preview the first 8 rows of the explicitly listed columns.
df[[
    'Unnamed: 0',
    'Age',
    'Sex',
    'Ethnicity',
    'BMI',
    'Waist_Circumference',
    'Fasting_Blood_Glucose',
    'HbA1c',
    'Blood_Pressure_Systolic',
    'Blood_Pressure_Diastolic',
    'Cholesterol_Total',
    'Cholesterol_HDL',
    'Cholesterol_LDL',
    'GGT',
    'Serum_Urate',
    'Physical_Activity_Level',
    'Dietary_Intake_Calories',
    'Alcohol_Consumption',
    'Smoking_Status',
    'Family_History_of_Diabetes',
    'Previous_Gestational_Diabetes',
]].head(8)
| Unnamed: 0 | Age | BMI | Waist_Circumference | Fasting_Blood_Glucose | HbA1c | Blood_Pressure_Systolic | Blood_Pressure_Diastolic | Cholesterol_Total | Cholesterol_HDL | Cholesterol_LDL | GGT | Serum_Urate | Dietary_Intake_Calories | Family_History_of_Diabetes | Previous_Gestational_Diabetes |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10000.00000 | 10000.000000 | 10000.000000 | 10000.00000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.00000 | 10000.00000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 |
| 4999.50000 | 44.620400 | 29.418150 | 94.79707 | 134.776210 | 9.507510 | 134.163700 | 89.558700 | 225.165540 | 55.019340 | 134.35377 | 55.16822 | 5.503430 | 2742.481900 | 0.507000 | 0.516500 |
| 2886.89568 | 14.343489 | 6.170866 | 14.38329 | 37.633544 | 3.176421 | 26.110317 | 17.237792 | 42.963744 | 14.537371 | 37.50238 | 25.88180 | 1.455091 | 716.643803 | 0.499976 | 0.499753 |
| 0.00000 | 20.000000 | 18.500000 | 70.00000 | 70.000000 | 4.000000 | 90.000000 | 60.000000 | 150.000000 | 30.000000 | 70.00000 | 10.00000 | 3.000000 | 1500.000000 | 0.000000 | 0.000000 |
| 2499.75000 | 32.000000 | 24.100000 | 82.20000 | 102.175000 | 6.800000 | 112.000000 | 75.000000 | 187.875000 | 42.300000 | 101.67500 | 32.60000 | 4.200000 | 2129.000000 | 0.000000 | 0.000000 |
| 4999.50000 | 45.000000 | 29.500000 | 94.90000 | 134.500000 | 9.500000 | 134.000000 | 89.000000 | 225.500000 | 55.200000 | 134.40000 | 55.45000 | 5.500000 | 2727.000000 | 1.000000 | 1.000000 |
| 7499.25000 | 57.000000 | 34.700000 | 107.00000 | 167.800000 | 12.300000 | 157.000000 | 105.000000 | 262.400000 | 67.900000 | 166.40000 | 77.50000 | 6.800000 | 3368.000000 | 1.000000 | 1.000000 |
| 9999.00000 | 69.000000 | 40.000000 | 120.00000 | 200.000000 | 15.000000 | 179.000000 | 119.000000 | 300.000000 | 80.000000 | 200.00000 | 100.00000 | 8.000000 | 3999.000000 | 1.000000 | 1.000000 |
# Descriptive statistics over the explicitly listed columns.
df[[
    'Unnamed: 0',
    'Age',
    'Sex',
    'Ethnicity',
    'BMI',
    'Waist_Circumference',
    'Fasting_Blood_Glucose',
    'HbA1c',
    'Blood_Pressure_Systolic',
    'Blood_Pressure_Diastolic',
    'Cholesterol_Total',
    'Cholesterol_HDL',
    'Cholesterol_LDL',
    'GGT',
    'Serum_Urate',
    'Physical_Activity_Level',
    'Dietary_Intake_Calories',
    'Alcohol_Consumption',
    'Smoking_Status',
    'Family_History_of_Diabetes',
    'Previous_Gestational_Diabetes',
]].describe()
| Column | Type | Non-Null Count | Missing (%) | Unique Values | Sample Values | Flags |
|---|---|---|---|---|---|---|
| Unnamed: 0 | int64 | 10000 | 0.0 | 10000 | 0, 1, 2 | id-like |
| Age | int64 | 10000 | 0.0 | 50 | 58, 48, 34 | |
| Sex | object | 10000 | 0.0 | 2 | Female, Male | |
| Ethnicity | object | 10000 | 0.0 | 4 | White, Asian, Black | |
| BMI | float64 | 10000 | 0.0 | 216 | 35.8, 24.1, 25.0 | |
| Waist_Circumference | float64 | 10000 | 0.0 | 501 | 83.4, 71.4, 113.8 | |
| Fasting_Blood_Glucose | float64 | 10000 | 0.0 | 1301 | 123.9, 183.7, 142.0 | |
| HbA1c | float64 | 10000 | 0.0 | 111 | 10.9, 12.8, 14.5 | |
| Blood_Pressure_Systolic | int64 | 10000 | 0.0 | 90 | 152, 103, 179 | |
| Blood_Pressure_Diastolic | int64 | 10000 | 0.0 | 60 | 114, 91, 104 | |
| Cholesterol_Total | float64 | 10000 | 0.0 | 1499 | 197.8, 261.6, 261.0 | |
| Cholesterol_HDL | float64 | 10000 | 0.0 | 501 | 50.2, 62.0, 32.1 | |
| Cholesterol_LDL | float64 | 10000 | 0.0 | 1301 | 99.2, 146.4, 164.1 | |
| GGT | float64 | 10000 | 0.0 | 901 | 37.5, 88.5, 56.2 | |
| Serum_Urate | float64 | 10000 | 0.0 | 51 | 7.2, 6.1, 6.9 | |
| Physical_Activity_Level | object | 10000 | 0.0 | 3 | Moderate, Low, High | |
| Dietary_Intake_Calories | int64 | 10000 | 0.0 | 2451 | 1538, 2653, 1684 | |
| Alcohol_Consumption | object | 6680 | 33.2 | 2 | Moderate, Heavy | |
| Smoking_Status | object | 10000 | 0.0 | 3 | Never, Current, Former | |
| Family_History_of_Diabetes | int64 | 10000 | 0.0 | 2 | 0, 1 | |
| Previous_Gestational_Diabetes | int64 | 10000 | 0.0 | 2 | 1, 0 | |
# Per-column summary statistics; each result is a Series indexed by column name.
# Row count floored at 1 so the ratio below never divides by zero.
n = max(len(df), 1)
dtypes = df.dtypes.astype(str)
nonnull = df.notna().sum()
miss_pct = df.isna().mean().mul(100).round(1)
uniques = df.nunique(dropna=True)
uniq_ratio = uniques.div(n).fillna(0.0)
def _sample_vals(s, k=3):
    """Return up to ``k`` distinct non-null values of ``s`` joined by ', '.

    Values are converted to strings before de-duplication and are kept in
    order of first appearance. A series with no non-null values yields ''.
    """
    vals = pd.unique(s.dropna().astype(str))[:k]
    # Joining an empty sequence already gives '', so no length guard is
    # needed, and the values are already strings.
    return ', '.join(vals)
# Derive a comma-separated flag string for every column, in column order.
flags_col = []
for c in df.columns:
    kind = dtypes[c]
    ratio = uniq_ratio.get(c, 0)
    flags = []
    # At most one distinct non-null value.
    if uniques.get(c, 0) <= 1:
        flags.append('constant')
    # Nearly every row distinct; datetime-typed columns are excluded.
    if ratio >= 0.95 and 'datetime' not in kind.lower():
        flags.append('id-like')
    # Object column with many distinct values that was not flagged id-like.
    elif kind.startswith('object') and ratio > 0.5:
        flags.append('high-card')
    flags_col.append(', '.join(flags))
# Assemble the per-column profile table from the statistics computed above.
sample_strings = [_sample_vals(df[c]) for c in df.columns]
profile_df = pd.DataFrame(
    {
        'Column': df.columns,
        'Type': dtypes.to_numpy(),
        'Non-Null Count': nonnull.to_numpy(),
        'Missing (%)': miss_pct.to_numpy(),
        'Unique Values': uniques.to_numpy(),
        'Sample Values': sample_strings,
        'Flags': flags_col,
    }
)
profile_df
| Column | Type | Non-Null Count | Unique Values |
|---|---|---|---|
| Unnamed: 0 | int64 | 10000 | 10000 |
| Age | int64 | 10000 | 50 |
| Sex | object | 10000 | 2 |
| Ethnicity | object | 10000 | 4 |
| BMI | float64 | 10000 | 216 |
| Waist_Circumference | float64 | 10000 | 501 |
| Fasting_Blood_Glucose | float64 | 10000 | 1301 |
| HbA1c | float64 | 10000 | 111 |
| Blood_Pressure_Systolic | int64 | 10000 | 90 |
| Blood_Pressure_Diastolic | int64 | 10000 | 60 |
| Cholesterol_Total | float64 | 10000 | 1499 |
| Cholesterol_HDL | float64 | 10000 | 501 |
| Cholesterol_LDL | float64 | 10000 | 1301 |
| GGT | float64 | 10000 | 901 |
| Serum_Urate | float64 | 10000 | 51 |
| Physical_Activity_Level | object | 10000 | 3 |
| Dietary_Intake_Calories | int64 | 10000 | 2451 |
| Alcohol_Consumption | object | 6680 | 2 |
| Smoking_Status | object | 10000 | 3 |
| Family_History_of_Diabetes | int64 | 10000 | 2 |
| Previous_Gestational_Diabetes | int64 | 10000 | 2 |
# Compact schema overview: dtype, non-null count and distinct-value count per column.
pd.DataFrame(
    {
        'Column': df.columns,
        'Type': df.dtypes.astype(str).to_numpy(),
        'Non-Null Count': df.notna().sum().to_numpy(),
        'Unique Values': df.nunique().to_numpy(),
    }
)
| Unnamed: 0 | Age | Sex | Ethnicity | BMI | Waist_Circumference | Fasting_Blood_Glucose | HbA1c | Blood_Pressure_Systolic | Blood_Pressure_Diastolic | Cholesterol_Total | Cholesterol_HDL | Cholesterol_LDL | GGT | Serum_Urate | Physical_Activity_Level | Dietary_Intake_Calories | Alcohol_Consumption | Smoking_Status | Family_History_of_Diabetes | Previous_Gestational_Diabetes |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5301 | 57 | Male | White | 39.3 | 117.6 | 110.5 | 4.0 | 149 | 104 | 278.4 | 71.9 | 157.5 | 34.4 | 3.6 | Moderate | 2086 | Moderate | Former | 0 | 1 |
| Unnamed: 0 | Age | Sex | Ethnicity | BMI | Waist_Circumference | Fasting_Blood_Glucose | HbA1c | Blood_Pressure_Systolic | Blood_Pressure_Diastolic | Cholesterol_Total | Cholesterol_HDL | Cholesterol_LDL | GGT | Serum_Urate | Physical_Activity_Level | Dietary_Intake_Calories | Alcohol_Consumption | Smoking_Status | Family_History_of_Diabetes | Previous_Gestational_Diabetes |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1536 | 27 | Female | Hispanic | 27.7 | 102.7 | 136.1 | 4.0 | 174 | 71 | 270.2 | 48.8 | 144.8 | 88.3 | 6.6 | High | 3388 | Moderate | Former | 0 | 1 |
| Unnamed: 0 | Age | Sex | Ethnicity | BMI | Waist_Circumference | Fasting_Blood_Glucose | HbA1c | Blood_Pressure_Systolic | Blood_Pressure_Diastolic | Cholesterol_Total | Cholesterol_HDL | Cholesterol_LDL | GGT | Serum_Urate | Physical_Activity_Level | Dietary_Intake_Calories | Alcohol_Consumption | Smoking_Status | Family_History_of_Diabetes | Previous_Gestational_Diabetes |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3100 | 44 | Male | Black | 25.9 | 85.2 | 197.8 | 4.0 | 143 | 117 | 250.6 | 70.4 | 148.9 | 50.0 | 7.6 | Low | 1539 | Moderate | Never | 1 | 1 |
# Robust (median/MAD) z-scores on a capped numeric subset of the frame.
num_cols = df.select_dtypes(include='number').columns.tolist()[:max_cols]
df_num = df[num_cols]
# Down-sample with a fixed seed only when the frame exceeds the row cap.
if len(df_num) > max_rows:
    df_num = df_num.sample(max_rows, random_state=0)
med = df_num.median()
mad = (df_num - med).abs().median()
# A zero MAD is swapped for NaN so the division gives NaN rather than inf.
safe_mad = mad.replace(0, np.nan)
rz = (0.6745 * (df_num - med)) / safe_mad
# Each row is scored by its largest absolute robust z across columns.
row_score = rz.abs().max(axis=1)
top3 = row_score.nlargest(3)
# Numeric columns only, capped at max_cols columns and max_rows rows.
dfn = df.select_dtypes(include='number').iloc[:, :max_cols]
if len(dfn) > max_rows:
    # Fixed seed keeps the sample reproducible between runs.
    dfn = dfn.sample(max_rows, random_state=0)
# rank columns by Tukey outliers (1.5*IQR) and plot violins with inner box
| Column | Missing Values | Missing (%) |
|---|---|---|
| Alcohol_Consumption | 3320 | 33.2 |
# Per-column missing-value counts and percentages; only columns that
# actually have missing values are displayed.
null_mask = df.isnull()  # computed once and reused for both statistics
nulls = null_mask.sum()
nulls_pct = (null_mask.mean() * 100).round(1)
missing_df = pd.DataFrame({
    'Column': df.columns,
    'Missing Values': nulls.values,
    'Missing (%)': nulls_pct.values,
})
missing_df[missing_df['Missing Values'] > 0]
# Rank columns by share of missing values and keep at most the top 20.
nulls = df.isna().sum()
nulls_pct = df.isna().mean().mul(100).round(1)
missing_df = pd.DataFrame(
    {
        'Column': df.columns,
        'Missing Values': nulls.to_numpy(),
        'Missing (%)': nulls_pct.to_numpy(),
    }
)
# Only columns with at least one missing value, worst first.
has_missing = missing_df['Missing Values'] > 0
ranked = missing_df[has_missing].sort_values('Missing (%)', ascending=False)
top_miss = ranked.loc[:, ['Column', 'Missing (%)']].head(20).reset_index(drop=True)
top_miss
# Name of the column whose value distribution is summarised.
# NOTE(review): the literal below looks like a template placeholder — confirm
# it is replaced with a real column name before this cell runs.
dist_col = '<chosen categorical>'
# Cast to object and substitute the label 'Missing' wherever the value is null.
s = df[dist_col].astype('object').where(~df[dist_col].isna(), other='Missing')
# Frequency of each label, including the 'Missing' label introduced above.
vc = s.value_counts(dropna=False)
top_k = 8 # Top-8 + Other (+ Missing)