smart-data-analyzer

CSV, Excel, JSON, Parquet 데이터 파일 자동 분석 — 통계 요약, 결측치 탐지, 상관관계, 시각화, 인사이트 도출까지 한번에

Hermes Agent

MIT

스마트 데이터 분석기

데이터 파일(CSV, Excel, JSON, Parquet)을 받아 자동으로 분석하고 인사이트를 도출하는 스킬. execute_code로 파이썬 스크립트를 실행하여 분석 결과를 반환합니다.

언제 사용할까

CSV/Excel/JSON/Parquet 파일 분석이 필요할 때
"이 데이터 파일 분석해줘"라고 할 때
데이터의 통계적 특성, 패턴, 이상치를 확인할 때
시각화 차트가 필요할 때

전제 조건

Python 3.10+ 와 pip
필요한 패키지: pandas, numpy, matplotlib, seaborn (자동 설치). Excel 파일을 읽으려면 openpyxl, Parquet 파일을 읽으려면 pyarrow가 추가로 필요합니다.

1. 빠른 분석 (원라이너)

데이터 파일 경로를 받으면 다음 단계를 자동 수행:

파일 포맷 감지 및 로드
기본 정보 (행/열 수, 데이터타입, 메모리)
통계 요약
결측치 분석
수치형 컬럼 분포
범주형 컬럼 빈도
상관관계 히트맵
주요 인사이트 도출

2. 분석 스크립트 템플릿

2.1 기본 분석 (execute_code로 실행)

# Basic analysis script: loads one data file and prints a text report
# (shape, memory, dtypes, numeric statistics, categorical summary,
# missing values, duplicate rows, strong correlations).
from itertools import combinations

import numpy as np
import pandas as pd

FILE_PATH = "/path/to/data.csv"  # <- replace with the actual file path

# 1. Load the file; the reader is chosen from the extension.
#    The comparison is case-insensitive so "DATA.CSV" is accepted too.
suffix = FILE_PATH.lower()
if suffix.endswith('.csv'):
    df = pd.read_csv(FILE_PATH)
elif suffix.endswith(('.xlsx', '.xls')):
    df = pd.read_excel(FILE_PATH)
elif suffix.endswith('.json'):
    df = pd.read_json(FILE_PATH)
elif suffix.endswith('.parquet'):
    df = pd.read_parquet(FILE_PATH)
else:
    raise ValueError(f"지원하지 않는 포맷: {FILE_PATH}")

# 2. Basic information
print("=" * 50)
print("📊 데이터 기본 정보")
print(f"   크기: {df.shape[0]:,} 행 × {df.shape[1]} 열")
print(f"   메모리: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
print("\n📋 컬럼별 타입:")
for col in df.columns:
    print(f"   {col}: {df[col].dtype} (결측치: {df[col].isnull().sum():,})")

# 3. Numeric summary statistics
print("\n📈 수치형 컬럼 통계:")
numeric_cols = df.select_dtypes(include=[np.number]).columns
if len(numeric_cols) > 0:
    print(df[numeric_cols].describe().to_string())
else:
    print("   수치형 컬럼 없음")

# 4. Categorical columns (only the first 10 are reported)
print("\n🏷️ 범주형 컬럼:")
cat_cols = df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols[:10]:
    modes = df[col].mode()  # computed once; empty when there is no mode
    most_common = modes.iloc[0] if len(modes) > 0 else 'N/A'
    print(f"   {col}: {df[col].nunique()}개 고유값 (최빈값: {most_common})")

# 5. Missing values (only columns that actually have some are listed)
missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(1)
missing_df = pd.DataFrame({'결측치수': missing, '비율(%)': missing_pct})
missing_df = missing_df[missing_df['결측치수'] > 0].sort_values('비율(%)', ascending=False)
if len(missing_df) > 0:
    print("\n⚠️ 결측치 현황:")
    print(missing_df.to_string())
else:
    print("\n✅ 결측치 없음")

# 6. Duplicate rows
dup_count = df.duplicated().sum()
# Guard the percentage so an empty frame reports 0.0% instead of dividing by zero.
dup_pct = dup_count / len(df) * 100 if len(df) > 0 else 0.0
print(f"\n🔄 중복행: {dup_count:,}개 ({dup_pct:.1f}%)")

# 7. Correlation between numeric columns
if len(numeric_cols) >= 2:
    print("\n🔗 높은 상관관계 (|r| > 0.7):")
    corr = df[numeric_cols].corr()
    # Visit each unordered column pair once (upper triangle of the matrix).
    for (i, left), (j, right) in combinations(enumerate(corr.columns), 2):
        val = corr.iloc[i, j]
        if abs(val) > 0.7:
            print(f"   {left} ↔ {right}: {val:.3f}")

2.2 시각화 생성

# Visualization script: loads one data file and writes up to four PNG charts
# (histograms, correlation heatmap, box plots, categorical bar charts) into
# OUTPUT_DIR, then prints the generated paths.
import os

import matplotlib
matplotlib.use('Agg')  # select the headless backend before pyplot is imported
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

FILE_PATH = "/path/to/data.csv"  # <- replace with the actual file path
OUTPUT_DIR = "/tmp/data-analysis"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the data with the same extension-based dispatch as the basic analysis
# script, so Excel/JSON/Parquet inputs work here as well.
suffix = FILE_PATH.lower()
if suffix.endswith('.csv'):
    df = pd.read_csv(FILE_PATH)
elif suffix.endswith(('.xlsx', '.xls')):
    df = pd.read_excel(FILE_PATH)
elif suffix.endswith('.json'):
    df = pd.read_json(FILE_PATH)
elif suffix.endswith('.parquet'):
    df = pd.read_parquet(FILE_PATH)
else:
    raise ValueError(f"지원하지 않는 포맷: {FILE_PATH}")

# Korean-capable fonts for chart labels (AppleGothic on macOS,
# Malgun Gothic on Windows), with a generic fallback.
plt.rcParams['font.family'] = ['AppleGothic', 'Malgun Gothic', 'sans-serif']
plt.rcParams['axes.unicode_minus'] = False

charts = []


def save_chart(fig, filename):
    """Write *fig* to OUTPUT_DIR/filename, close it, and record the path in `charts`."""
    path = f"{OUTPUT_DIR}/{filename}"
    fig.tight_layout()
    fig.savefig(path, dpi=150, bbox_inches='tight')
    plt.close(fig)  # close this specific figure so figures do not accumulate
    charts.append(path)


# 1. Histograms of numeric columns (at most 6)
numeric_cols = df.select_dtypes(include=[np.number]).columns
n_cols = min(len(numeric_cols), 6)
if n_cols > 0:
    # squeeze=False always yields a 2-D array of Axes, whatever the grid size.
    fig, axes = plt.subplots(2, (n_cols + 1) // 2, figsize=(14, 8), squeeze=False)
    axes = axes.flatten()
    for ax, col in zip(axes, numeric_cols[:n_cols]):
        df[col].hist(ax=ax, bins=30, edgecolor='black', alpha=0.7)
        ax.set_title(f'{col} 분포')
        ax.set_xlabel(col)
    for ax in axes[n_cols:]:
        ax.set_visible(False)  # hide unused grid cells
    save_chart(fig, "distributions.png")

# 2. Correlation heatmap
if len(numeric_cols) >= 3:
    fig, ax = plt.subplots(figsize=(10, 8))
    corr = df[numeric_cols].corr()
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu_r', center=0, ax=ax)
    ax.set_title('상관관계 히트맵')
    save_chart(fig, "correlation.png")

# 3. Box plots for outlier inspection (at most 4 columns)
n_box = min(len(numeric_cols), 4)
if n_box > 0:
    fig, axes = plt.subplots(1, n_box, figsize=(14, 5), squeeze=False)
    for ax, col in zip(axes.flatten(), numeric_cols[:n_box]):
        df.boxplot(column=col, ax=ax)
        ax.set_title(f'{col} 박스플롯')
    save_chart(fig, "boxplots.png")

# 4. Bar charts of the 10 most frequent values per categorical column (at most 6)
cat_cols = df.select_dtypes(include=['object', 'category']).columns
n_cats = min(len(cat_cols), 6)
if n_cats > 0:
    fig, axes = plt.subplots(2, (n_cats + 1) // 2, figsize=(14, 8), squeeze=False)
    axes = axes.flatten()
    for ax, col in zip(axes, cat_cols[:n_cats]):
        top10 = df[col].value_counts().head(10)
        if top10.empty:
            # Nothing to draw for this column; skip it rather than plotting
            # an empty Series.
            ax.set_visible(False)
            continue
        top10.plot(kind='bar', ax=ax, edgecolor='black')
        ax.set_title(f'{col} Top 10')
        ax.tick_params(axis='x', rotation=45)
    for ax in axes[n_cats:]:
        ax.set_visible(False)  # hide unused grid cells
    save_chart(fig, "categorical.png")

print(f"생성된 차트: {charts}")
for c in charts:
    print(f"  MEDIA:{c}")

2.3 인사이트 자동 도출

# Insight script: loads one data file and prints up to 15 automatically
# derived observations (IQR outliers, strong skew, monthly trend of the first
# numeric column, and category-vs-numeric mean comparisons).
import numpy as np
import pandas as pd

FILE_PATH = "/path/to/data.csv"  # <- replace with the actual file path

# Load the data with the same extension-based dispatch as the basic analysis
# script, so Excel/JSON/Parquet inputs work here as well.
suffix = FILE_PATH.lower()
if suffix.endswith('.csv'):
    df = pd.read_csv(FILE_PATH)
elif suffix.endswith(('.xlsx', '.xls')):
    df = pd.read_excel(FILE_PATH)
elif suffix.endswith('.json'):
    df = pd.read_json(FILE_PATH)
elif suffix.endswith('.parquet'):
    df = pd.read_parquet(FILE_PATH)
else:
    raise ValueError(f"지원하지 않는 포맷: {FILE_PATH}")

insights = []

# 1. Outlier detection (IQR rule: outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR])
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    is_outlier = (df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)
    n_outliers = int(is_outlier.sum())
    if n_outliers > 0:
        pct = n_outliers / len(df) * 100
        insights.append(f"⚠️ '{col}'에 {n_outliers}개 이상치 발견 ({pct:.1f}%)")

# 2. Skewness (only |skew| > 2 is reported)
for col in numeric_cols:
    skew = df[col].skew()
    if abs(skew) > 2:
        direction = "오른쪽으로 치우침(긴 꼬리)" if skew > 0 else "왼쪽으로 치우침"
        insights.append(f"📊 '{col}'의 분포가 {direction} (skewness={skew:.2f})")

# 3. Date trend (when a text column can be parsed as dates)
parsed_dates = {}
for col in df.columns:
    if df[col].dtype == 'object':
        try:
            parsed = pd.to_datetime(df[col])
        except (ValueError, TypeError, OverflowError):
            # Not a date column; only parsing failures are swallowed here.
            continue
        if parsed.notna().any():  # an all-missing column carries no dates
            parsed_dates[col] = parsed
for col in list(parsed_dates)[:3]:
    # Reuse the already parsed values; converting in place also keeps this
    # column out of the categorical comparison in step 4.
    df[col] = parsed_dates[col]
    if len(numeric_cols) > 0:
        target = numeric_cols[0]
        trend = df.groupby(df[col].dt.to_period('M'))[target].mean()
        if len(trend) >= 2:
            first, last = trend.iloc[0], trend.iloc[-1]
            # Divide by |first| so the sign of the change follows the direction
            # of movement even when the baseline is negative.
            change = ((last - first) / abs(first) * 100) if first != 0 else 0
            insights.append(f"📈 '{target}'의 월별 추세: {first:.2f} → {last:.2f} ({change:+.1f}%)")

# 4. Categorical vs numeric relationship (first 5 categorical x first 3 numeric)
cat_cols = df.select_dtypes(include=['object', 'category']).columns
for cat in cat_cols[:5]:
    for num in numeric_cols[:3]:
        # Drop groups whose mean is undefined so idxmax/idxmin always have data.
        groups = df.groupby(cat)[num].mean().dropna()
        if not 2 <= len(groups) <= 20:
            continue
        best, worst = groups.idxmax(), groups.idxmin()
        if best == worst:
            continue
        high, low = groups.max(), groups.min()
        if low > 0:
            ratio = high / low
            insights.append(f"💡 '{cat}'별 '{num}' 평균: '{best}'이 '{worst}'보다 {ratio:.1f}x 높음")
        else:
            # A ratio against a zero or negative mean is meaningless, so report
            # the two means themselves instead.
            insights.append(f"💡 '{cat}'별 '{num}' 평균: '{best}'({high:.2f})이 '{worst}'({low:.2f})보다 높음")

print("=" * 50)
print("🔍 데이터 인사이트")
print("=" * 50)
if insights:
    for i, insight in enumerate(insights[:15], 1):
        print(f"{i}. {insight}")
else:
    print("특이사항 없음 — 데이터가 비교적 정상 분포를 따릅니다.")

3. 대용량 파일 처리

파일이 큰 경우 (50MB+):

import pandas as pd

# Read the file in chunks so the whole file is never held in memory at once
chunk_size = 100000
total_rows = 0
for chunk in pd.read_csv('large_file.csv', chunksize=chunk_size):
    total_rows += len(chunk)
    # per-chunk processing goes here...

# Alternatively, use dask
import dask.dataframe as dd

ddf = dd.read_csv('large_file.csv')

4. 데이터 전처리 추천

분석 결과를 바탕으로 다음을 자동 제안:

결측치 처리 — 삭제 vs 대체(평균/중앙값/최빈값/KNN)
이상치 처리 — 삭제, 클리핑, 로그 변환
인코딩 — 범주형 → 원핫/라벨/타겟 인코딩
스케일링 — StandardScaler, MinMaxScaler, RobustScaler
특성 엔지니어링 — 날짜 분해, 파생 변수 생성

5. 출력 형식

분석 완료 후 다음 형태로 결과 전달:

📊 데이터 분석 완료
📁 파일: data.csv
📐 크기: 10,000행 × 15열
💾 메모리: 4.2 MB

📌 주요 인사이트:
1. revenue 컬럼에 127개 이상치 (1.3%)
2. age 분포가 오른쪽으로 치우침 (skewness=2.4)
3. category_A가 revenue 평균이 category_B보다 3.2x 높음

⚠️ 주의사항:
- email 컬럼에 5.2% 결측치
- 중복행 23개 존재

📈 시각화:
MEDIA:/tmp/data-analysis/distributions.png
MEDIA:/tmp/data-analysis/correlation.png

주의사항

시각화 시 반드시 matplotlib.use('Agg') 설정 (헤드리스)
대용량 파일은 청크 단위로 처리
민감한 데이터(개인정보, 금융정보)는 결과 공유 전 마스킹 검토
Excel 파일은 .xlsx 사용을 권장 (구형 .xls는 xlrd 패키지가 별도로 필요하며 지원이 제한적)

Related Skills / 관련 스킬

data-science

data-analysis

Use this skill when the user uploads Excel (.xlsx/.xls) or CSV files and wants to perform data analysis, generate statistics, create summaries, pivot tables, SQL queries, or any form of structured data exploration. Supports multi-sheet Excel workbooks, aggregation, filtering, joins, and exporting results to CSV/JSON/Markdown.

data-science v1.0.0

ZIP

jupyter-live-kernel

라이브 Jupyter 커널로 상태 유지형 반복 Python 분석