Коллекция песен из индийского кинематографа DataSet: различия между версиями

Материал из Поле цифровой дидактики
Нет описания правки
Нет описания правки
Строка 65: Строка 65:
# Анализ БД
# Анализ БД


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
# ==========================================
# A) Imports + Global Config
# ==========================================
# Fix the RNG seed so any sampling below is reproducible.
np.random.seed(42)

# Pandas display settings: wide frames, two-decimal floats.
for _opt, _val in (
    ('display.max_columns', 50),
    ('display.width', 1000),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(_opt, _val)

# Matplotlib / seaborn defaults shared by every plot in this analysis.
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'axes.titlesize': 14,
    'axes.labelsize': 12,
})
sns.set_theme(style="whitegrid")

# Provided Inputs
DATASET_NAME = "moonknightmarvel/dataset-of-songs-with-genreartistmovielanguage"
EXACT_COLUMNS = ["title", "artist", "movie", "language", "emotion"]
TARGET_COL = None  # no supervised target configured for this EDA pass
# ==========================================
# B) Helper Functions (Robust & Defensive)
# ==========================================
def safe_read_csv(path):
    """Read a CSV into a DataFrame; on any failure, report it and return an empty frame."""
    try:
        return pd.read_csv(path)
    except Exception as err:
        # Defensive by design: a missing/corrupt file must not kill the pipeline;
        # downstream code checks df.empty instead.
        print(f"CRITICAL ERROR: Could not read CSV at {path}. Error: {err}")
        return pd.DataFrame()
def validate_columns(df, expected_cols):
    """Print a report comparing df's columns against expected_cols.

    Silently does nothing for an empty frame or an empty expectation list.
    """
    if df.empty or not expected_cols:
        return

    actual = df.columns.tolist()
    expected_set = set(expected_cols)
    actual_set = set(actual)
    # Preserve original ordering in the printed lists.
    missing = [c for c in expected_cols if c not in actual_set]
    extra = [c for c in actual if c not in expected_set]

    divider = "-" * 30
    print(divider)
    print(f"COLUMN VALIDATION: {DATASET_NAME}")
    if missing or extra:
        if missing:
            print(f"Missing expected columns: {missing}")
        if extra:
            print(f"Extra columns found: {extra}")
    else:
        print("Success: All expected columns found. No extra columns.")
    print(divider)
def audit_missingness(df):
    """Return a per-column audit DataFrame: null count, null %, non-null count."""
    nulls = df.isnull().sum()
    return pd.DataFrame({
        'Null Count': nulls,
        'Null %': (nulls / len(df)) * 100,
        'Non-Null Count': df.notnull().sum(),
    })
def detect_column_types(df):
    """Split columns into (numeric, categorical, date-like) name lists.

    Date-likeness is a cheap heuristic: if any of the first five non-null
    values matches YYYY-MM-DD or DD/MM/YYYY, the column is flagged as a date
    and excluded from the categorical list.
    """
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    object_like = df.select_dtypes(include=['object', 'category']).columns.tolist()

    date_pattern = r'\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}'
    dates = []
    for name in object_like:
        head = df[name].dropna().head(5).astype(str)
        if head.str.contains(date_pattern, regex=True).any():
            dates.append(name)

    categorical = [name for name in object_like if name not in dates]
    return numeric, categorical, dates
def safe_to_numeric(series):
    """Coerce a Series to numeric; unparseable values become NaN.

    Returns the input unchanged if conversion itself fails (e.g. an
    unsupported input type).
    """
    try:
        return pd.to_numeric(series, errors='coerce')
    # Narrowed from a bare `except:` so system exceptions
    # (KeyboardInterrupt, SystemExit) are never swallowed.
    except (TypeError, ValueError):
        return series
def safe_to_datetime(series):
    """Coerce a Series to datetime; unparseable values become NaT.

    Returns the input unchanged if conversion itself fails (e.g. an
    unsupported input type).
    """
    try:
        return pd.to_datetime(series, errors='coerce')
    # Narrowed from a bare `except:` so system exceptions
    # (KeyboardInterrupt, SystemExit) are never swallowed.
    except (TypeError, ValueError):
        return series
def plot_missingness(df):
    """Bar-plot the (up to 30) worst columns by missing percentage.

    Silently does nothing when no column has missing values.
    """
    pct = (df.isnull().sum() / len(df)) * 100
    worst = pct[pct > 0].sort_values(ascending=False).head(30)
    if worst.empty:
        return
    sns.barplot(x=worst.values, y=worst.index, hue=worst.index, palette='Reds_r', legend=False)
    plt.title("Top Columns by Missing Percentage (%)")
    plt.xlabel("% Missing")
    plt.show()
def plot_univariate_num(df, num_cols):
    """Histogram (+KDE) grid for up to the first 12 numeric columns, 3 per row.

    Does nothing when num_cols is empty.
    """
    selected = num_cols[:12]
    if not selected:
        return
    n_rows = (len(selected) + 2) // 3  # ceil(len/3)
    fig, axes = plt.subplots(n_rows, 3, figsize=(18, 5 * n_rows))
    axes = axes.flatten()
    last = 0
    for last, (ax, col) in enumerate(zip(axes, selected)):
        sns.histplot(df[col].dropna(), kde=True, ax=ax, color='teal')
        ax.set_title(f"Distribution of {col}")
    # Remove the unused trailing axes so the grid has no empty panels.
    for ax in axes[last + 1:]:
        fig.delaxes(ax)
    plt.tight_layout()
    plt.show()
def plot_univariate_cat(df, cat_cols):
    """Horizontal bar chart of the 15 most frequent values for up to 6 categorical columns.

    Note: the original branched on ``nunique() > 50`` but both branches were
    byte-identical (``value_counts().head(15)``), so the dead branch is
    removed; behavior is unchanged.
    """
    for col in cat_cols[:6]:
        # Always cap at the top 15 categories regardless of cardinality.
        top_vals = df[col].value_counts().head(15)
        plt.figure(figsize=(10, 5))
        sns.barplot(x=top_vals.values, y=top_vals.index, hue=top_vals.index, palette='viridis', legend=False)
        plt.title(f"Top 15 Categories: {col}")
        plt.show()
def plot_correlation(df, num_cols):
    """Annotated heatmap of pairwise Pearson correlations.

    Skipped entirely when fewer than two numeric columns are supplied.
    """
    if len(num_cols) < 2:
        return
    plt.figure(figsize=(10, 8))
    sns.heatmap(df[num_cols].corr(), annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
    plt.title("Pearson Correlation Matrix")
    plt.show()
# ==========================================
# C) Load + Validate
# ==========================================
# NOTE(review): DATA_DIR is never defined in this file — it is assumed to be
# injected by the execution environment (e.g. a Kaggle notebook cell).
# Confirm it points at the songs CSV before running.
df = safe_read_csv(DATA_DIR)
validate_columns(df, EXACT_COLUMNS)
if not df.empty:
    # ==========================================
    # D) Data Audit
    # ==========================================
    # High-level shape / memory / duplicate overview, printed before any mutation.
    print(f"\n--- DATA AUDIT: {DATASET_NAME} ---")
    print(f"Shape: {df.shape}")
    print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    print(f"Duplicates: {df.duplicated().sum()}")

    null_audit = audit_missingness(df)
    print("\nNull Audit Summary (Top 5 Missing):")
    print(null_audit.sort_values(by='Null %', ascending=False).head(5))

    num_cols, cat_cols, date_cols = detect_column_types(df)

    print(f"\nDetected Numeric Columns: {num_cols}")
    print(f"Detected Categorical Columns: {cat_cols}")
    print(f"Detected Date Columns: {date_cols}")

    # Numeric Stability
    if num_cols:
        # Count +/-inf cells across all numeric columns at once.
        inf_counts = np.isinf(df[num_cols]).sum().sum()
        print(f"Total Inf/-Inf values: {inf_counts}")
        # Near-constant columns carry little modeling signal; flag them.
        low_var = [c for c in num_cols if df[c].std() < 0.01]
        if low_var: print(f"Low variance columns: {low_var}")
    # ==========================================
    # E) ETL (Safe + Reversible)
    # ==========================================
    # All cleaning happens on a copy so the raw frame `df` stays intact.
    df_clean = df.copy()

    # Strip whitespace & Unify Missing Tokens.
    # NOTE: astype(str) turns real NaN into the literal string "nan", which the
    # token replacement below maps back to np.nan.
    missing_tokens = ["", "NA", "N/A", "null", "None", "nan"]
    for col in df_clean.columns:
        if df_clean[col].dtype == 'object':
            df_clean[col] = df_clean[col].astype(str).str.strip()
            df_clean[col] = df_clean[col].replace(missing_tokens, np.nan)

    # Attempt to convert object columns to numeric if they are high-signal.
    # Heuristic: if the first 10 non-null values all look like plain numbers, coerce.
    for col in cat_cols:
        sample = df_clean[col].dropna().head(10)
        if sample.str.match(r'^-?\d+(\.\d+)?$').all():
            df_clean[col] = safe_to_numeric(df_clean[col])

    # Handle Duplicates
    df_clean = df_clean.drop_duplicates(keep='first')

    # Recalculate types after cleaning (numeric coercion may have moved columns).
    num_cols, cat_cols, date_cols = detect_column_types(df_clean)

    # Missing Value Handling (Imputation + Indicators).
    # Every imputed column gets a companion "__was_missing" flag so the
    # imputation remains auditable/reversible downstream.
    for col in num_cols:
        if df_clean[col].isnull().any():
            df_clean[f"{col}__was_missing"] = df_clean[col].isnull().astype(int)
            df_clean[col] = df_clean[col].fillna(df_clean[col].median())

    for col in cat_cols:
        if df_clean[col].isnull().any():
            df_clean[f"{col}__was_missing"] = df_clean[col].isnull().astype(int)
            df_clean[col] = df_clean[col].fillna("Missing")
    # ==========================================
    # F) EDA (Univariate & Bivariate)
    # ==========================================
    if num_cols:
        print("\nNumeric Distribution Summary:")
        stats = df_clean[num_cols].agg(['mean', 'std', 'min', 'median', 'max', 'skew'])
        print(stats.T)

        if len(num_cols) >= 2:
            # Scan the upper triangle of |corr| for strongly related pairs
            # (k=1 excludes the diagonal and the mirrored lower triangle).
            high_corr = []
            corr_mat = df_clean[num_cols].corr().abs()
            upper = corr_mat.where(np.triu(np.ones(corr_mat.shape), k=1).astype(bool))
            for col in upper.columns:
                for row in upper.index:
                    if upper.loc[row, col] >= 0.85:
                        high_corr.append((row, col, upper.loc[row, col]))
            if high_corr:
                print("\nHighly Correlated Pairs (|r| >= 0.85):")
                for r, c, v in high_corr: print(f" - {r} & {c}: {v:.3f}")
    # ==========================================
    # G) Feature Engineering (Lightweight)
    # ==========================================
    # String Lengths for Title/Artist etc.
    text_candidates = [c for c in cat_cols if c in ['title', 'artist', 'movie', 'language']]
    for col in text_candidates:
        if col in df_clean.columns:
            df_clean[f"{col}__len"] = df_clean[col].astype(str).apply(len)
            df_clean[f"{col}__words"] = df_clean[col].astype(str).apply(lambda x: len(x.split()))

    # Datetime expansion: year/month/day-of-week parts for detected date columns.
    for col in date_cols:
        df_clean[col] = safe_to_datetime(df_clean[col])
        if not df_clean[col].isnull().all():
            df_clean[f"{col}__year"] = df_clean[col].dt.year
            df_clean[f"{col}__month"] = df_clean[col].dt.month
            df_clean[f"{col}__dayofweek"] = df_clean[col].dt.dayofweek
    # ==========================================
    # H) Visualization
    # ==========================================
    # Missingness is plotted from the RAW frame (df): after imputation above,
    # df_clean would show nothing.
    plot_missingness(df)

    num_cols_updated, cat_cols_updated, _ = detect_column_types(df_clean)

    # Univariate Numerical
    plot_univariate_num(df_clean, num_cols_updated)

    # Univariate Categorical
    # NOTE(review): uses the pre-feature-engineering `cat_cols`, not
    # `cat_cols_updated` — presumably intentional to skip engineered columns;
    # confirm.
    plot_univariate_cat(df_clean, cat_cols)

    # Bivariate
    plot_correlation(df_clean, num_cols_updated)

    # Target-Aware Analysis (if TARGET_COL provided)
    if TARGET_COL and TARGET_COL in df_clean.columns:
        print(f"\nTarget Analysis: {TARGET_COL}")
        if TARGET_COL in num_cols_updated:
            # Numeric Target: rank features by correlation with the target.
            target_corr = df_clean[num_cols_updated].corr()[TARGET_COL].sort_values(ascending=False)
            print("Correlations with Target:")
            print(target_corr)
        else:
            # Categorical Target: class-balance bar chart.
            plt.figure(figsize=(10, 5))
            sns.countplot(data=df_clean, x=TARGET_COL, hue=TARGET_COL, palette='magma', legend=False)
            plt.title(f"Target Distribution: {TARGET_COL}")
            plt.xticks(rotation=45)
            plt.show()
    # ==========================================
    # I) Final Artifact Output
    # ==========================================
    print("\n--- FINAL SUMMARY ---")
    print(f"Original Shape: {df.shape}")
    print(f"Cleaned Shape:  {df_clean.shape}")
    # Duplicate count in the raw frame equals the number of rows dropped by
    # drop_duplicates above.
    print(f"Duplicates removed: {df.duplicated().sum()}")
    print(f"Columns processed: {list(df_clean.columns)}")
    print("\nProcessed Data Preview (df_clean.head()):")
    print(df_clean.head())
else:
    print("DataFrame is empty. Pipeline terminated.")


[[Категория:Работы ИНДОР-211]]
[[Категория:Работы ИНДОР-211]]
[[Категория:BigDataWorks]]
[[Категория:BigDataWorks]]

Версия от 12:54, 14 апреля 2026


Описание модели Коллекция песен из индийского кинематографа
Область знаний Информатика, Образование, Искусственный интеллект, Большие данные, Музыка, Медиа
Веб-страница - ссылка на модель https://www.kaggle.com/datasets/moonknightmarvel/dataset-of-songs-with-genreartistmovielanguage/data
Видео запись
Разработчики Pocrovskii Alexander
Среды и средства, в которых реализована модель Python, Большие данные
Диаграмма модели
Описание полей данных, которые модель порождает
Модель создана студентами? Да

Общая информация

  • Авторы: Студент группы - Pokrovskii Alexander
  • Дата исследования: 14 апреля 2026
  • Источник: Kaggle Datasets
  • Платформа: Kaggle
  • Дата публикации: 23 апреля 2026 г.

Исходные данные

Описание исследования

Исследование посвящено анализу структурированных музыкальных метаданных на примере датасета песен из индийских фильмов.

Цель

Выявить статистически значимые связи между метаданными песен (язык, исполнитель, фильм) и их эмоциональной категорией, а также построить и валидировать модель машинного обучения для прогнозирования эмоции песни на основе доступных признаков с точностью не ниже 75% (F1-macro).

Задачи

  1. Выполнить предобработку: кодирование категориальных признаков (Artist, Movie, Language), балансировку данных (при необходимости), разделение на обучающую/тестовую выборки.
  2. Провести разведочный анализ данных (EDA): изучить распределения признаков, пропуски и дубликаты, визуализировать частоты категорий (язык, исполнитель, эмоция).
  3. Построить и сравнить несколько моделей классификации (логистическая регрессия, Random Forest, XGBoost) с кросс-валидацией, оценить метрики качества (accuracy, precision, recall, F1-score).
  4. Визуализировать результаты: матрицу ошибок, важность признаков, распределение предсказаний, а также сформировать интерпретируемые выводы о доминирующих факторах, влияющих на эмоциональную окраску песни.

Гипотеза

Эмоциональная категория песни (Emotion) статистически значимо зависит от комбинации языка исполнения и исполнителя: песни на телугу в исполнении артистов «первого эшелона» (например, Sid Sriram, Armaan Malik) с большей вероятностью относятся к категориям Love или Joy, тогда как треки второстепенных исполнителей или из менее популярных фильмов чаще маркируются как Sadness или Anticipation. При этом модель, обученная на признаках Language + Artist + Movie, покажет качество прогнозирования эмоции выше базового уровня (majority class baseline) не менее чем на 20 п.п. по метрике F1-macro.


Программный код

<syntaxhighlight lang="python">

  1. Анализ БД

import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import re

  1. ==========================================
  2. A) Imports + Global Config
  3. ==========================================

np.random.seed(42) pd.set_option('display.max_columns', 50) pd.set_option('display.width', 1000) pd.set_option('display.float_format', '{:.2f}'.format)

plt.rcParams['figure.figsize'] = (12, 6) plt.rcParams['axes.titlesize'] = 14 plt.rcParams['axes.labelsize'] = 12 sns.set_theme(style="whitegrid")

  1. Provided Inputs

DATASET_NAME = "moonknightmarvel/dataset-of-songs-with-genreartistmovielanguage" EXACT_COLUMNS = ["title", "artist", "movie", "language", "emotion"] TARGET_COL = None

  1. ==========================================
  2. B) Helper Functions (Robust & Defensive)
  3. ==========================================

def safe_read_csv(path):

   try:
       # DATA_DIR is assumed to be defined in the environment per instructions
       return pd.read_csv(path)
   except Exception as e:
       print(f"CRITICAL ERROR: Could not read CSV at {path}. Error: {e}")
       return pd.DataFrame()

def validate_columns(df, expected_cols):

   if df.empty or not expected_cols:
       return
   actual_cols = df.columns.tolist()
   missing = [c for c in expected_cols if c not in actual_cols]
   extra = [c for c in actual_cols if c not in expected_cols]
   
   print("-" * 30)
   print(f"COLUMN VALIDATION: {DATASET_NAME}")
   if not missing and not extra:
       print("Success: All expected columns found. No extra columns.")
   else:
       if missing: print(f"Missing expected columns: {missing}")
       if extra: print(f"Extra columns found: {extra}")
   print("-" * 30)

def audit_missingness(df):

   null_counts = df.isnull().sum()
   null_pct = (null_counts / len(df)) * 100
   non_null = df.notnull().sum()
   audit = pd.DataFrame({
       'Null Count': null_counts,
       'Null %': null_pct,
       'Non-Null Count': non_null
   })
   return audit

def detect_column_types(df):

   num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
   cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
   date_cols = []
   
   # Simple heuristic for potential date columns
   for col in cat_cols:
       sample = df[col].dropna().head(5).astype(str)
       if any(sample.str.contains(r'\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}', regex=True)):
           date_cols.append(col)
           
   # Refine cat_cols (remove dates)
   cat_cols = [c for c in cat_cols if c not in date_cols]
   return num_cols, cat_cols, date_cols

def safe_to_numeric(series):

   try:
       return pd.to_numeric(series, errors='coerce')
   except:
       return series

def safe_to_datetime(series):

   try:
       return pd.to_datetime(series, errors='coerce')
   except:
       return series

def plot_missingness(df):

   null_pct = (df.isnull().sum() / len(df)) * 100
   null_pct = null_pct[null_pct > 0].sort_values(ascending=False).head(30)
   if not null_pct.empty:
       sns.barplot(x=null_pct.values, y=null_pct.index, hue=null_pct.index, palette='Reds_r', legend=False)
       plt.title("Top Columns by Missing Percentage (%)")
       plt.xlabel("% Missing")
       plt.show()

def plot_univariate_num(df, num_cols):

   cols_to_plot = num_cols[:12]
   if not cols_to_plot: return
   n = len(cols_to_plot)
   rows = (n + 2) // 3
   fig, axes = plt.subplots(rows, 3, figsize=(18, 5 * rows))
   axes = axes.flatten()
   for i, col in enumerate(cols_to_plot):
       sns.histplot(df[col].dropna(), kde=True, ax=axes[i], color='teal')
       axes[i].set_title(f"Distribution of {col}")
   for j in range(i + 1, len(axes)):
       fig.delaxes(axes[j])
   plt.tight_layout()
   plt.show()

def plot_univariate_cat(df, cat_cols):

   cols_to_plot = cat_cols[:6]
   if not cols_to_plot: return
   for col in cols_to_plot:
       if df[col].nunique() > 50:
           top_vals = df[col].value_counts().head(15)
       else:
           top_vals = df[col].value_counts().head(15)
       
       plt.figure(figsize=(10, 5))
       sns.barplot(x=top_vals.values, y=top_vals.index, hue=top_vals.index, palette='viridis', legend=False)
       plt.title(f"Top 15 Categories: {col}")
       plt.show()

def plot_correlation(df, num_cols):

   if len(num_cols) < 2: return
   corr = df[num_cols].corr()
   plt.figure(figsize=(10, 8))
   sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
   plt.title("Pearson Correlation Matrix")
   plt.show()
  1. ==========================================
  2. C) Load + Validate
  3. ==========================================

df = safe_read_csv(DATA_DIR) validate_columns(df, EXACT_COLUMNS)

if not df.empty:

   # ==========================================
   # D) Data Audit
   # ==========================================
   print(f"\n--- DATA AUDIT: {DATASET_NAME} ---")
   print(f"Shape: {df.shape}")
   print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
   print(f"Duplicates: {df.duplicated().sum()}")
   
   null_audit = audit_missingness(df)
   print("\nNull Audit Summary (Top 5 Missing):")
   print(null_audit.sort_values(by='Null %', ascending=False).head(5))
   
   num_cols, cat_cols, date_cols = detect_column_types(df)
   
   print(f"\nDetected Numeric Columns: {num_cols}")
   print(f"Detected Categorical Columns: {cat_cols}")
   print(f"Detected Date Columns: {date_cols}")
   
   # Numeric Stability
   if num_cols:
       inf_counts = np.isinf(df[num_cols]).sum().sum()
       print(f"Total Inf/-Inf values: {inf_counts}")
       low_var = [c for c in num_cols if df[c].std() < 0.01]
       if low_var: print(f"Low variance columns: {low_var}")
   # ==========================================
   # E) ETL (Safe + Reversible)
   # ==========================================
   df_clean = df.copy()
   
   # Strip whitespace & Unify Missing Tokens
   missing_tokens = ["", "NA", "N/A", "null", "None", "nan"]
   for col in df_clean.columns:
       if df_clean[col].dtype == 'object':
           df_clean[col] = df_clean[col].astype(str).str.strip()
           df_clean[col] = df_clean[col].replace(missing_tokens, np.nan)
   
   # Attempt to convert object columns to numeric if they are high-signal
   for col in cat_cols:
       sample = df_clean[col].dropna().head(10)
       if sample.str.match(r'^-?\d+(\.\d+)?$').all():
           df_clean[col] = safe_to_numeric(df_clean[col])
   
   # Handle Duplicates
   df_clean = df_clean.drop_duplicates(keep='first')
   
   # Recalculate types after cleaning
   num_cols, cat_cols, date_cols = detect_column_types(df_clean)
   
   # Missing Value Handling (Imputation + Indicators)
   for col in num_cols:
       if df_clean[col].isnull().any():
           df_clean[f"{col}__was_missing"] = df_clean[col].isnull().astype(int)
           df_clean[col] = df_clean[col].fillna(df_clean[col].median())
           
   for col in cat_cols:
       if df_clean[col].isnull().any():
           df_clean[f"{col}__was_missing"] = df_clean[col].isnull().astype(int)
           df_clean[col] = df_clean[col].fillna("Missing")
   # ==========================================
   # F) EDA (Univariate & Bivariate)
   # ==========================================
   if num_cols:
       print("\nNumeric Distribution Summary:")
       stats = df_clean[num_cols].agg(['mean', 'std', 'min', 'median', 'max', 'skew'])
       print(stats.T)
       
       if len(num_cols) >= 2:
           high_corr = []
           corr_mat = df_clean[num_cols].corr().abs()
           upper = corr_mat.where(np.triu(np.ones(corr_mat.shape), k=1).astype(bool))
           for col in upper.columns:
               for row in upper.index:
                   if upper.loc[row, col] >= 0.85:
                       high_corr.append((row, col, upper.loc[row, col]))
           if high_corr:
               print("\nHighly Correlated Pairs (|r| >= 0.85):")
               for r, c, v in high_corr: print(f" - {r} & {c}: {v:.3f}")
   # ==========================================
   # G) Feature Engineering (Lightweight)
   # ==========================================
   # String Lengths for Title/Artist etc.
   text_candidates = [c for c in cat_cols if c in ['title', 'artist', 'movie', 'language']]
   for col in text_candidates:
       if col in df_clean.columns:
           df_clean[f"{col}__len"] = df_clean[col].astype(str).apply(len)
           df_clean[f"{col}__words"] = df_clean[col].astype(str).apply(lambda x: len(x.split()))
           
   # Datetime expansion
   for col in date_cols:
       df_clean[col] = safe_to_datetime(df_clean[col])
       if not df_clean[col].isnull().all():
           df_clean[f"{col}__year"] = df_clean[col].dt.year
           df_clean[f"{col}__month"] = df_clean[col].dt.month
           df_clean[f"{col}__dayofweek"] = df_clean[col].dt.dayofweek
   # ==========================================
   # H) Visualization
   # ==========================================
   plot_missingness(df)
   
   num_cols_updated, cat_cols_updated, _ = detect_column_types(df_clean)
   
   # Univariate Numerical
   plot_univariate_num(df_clean, num_cols_updated)
   
   # Univariate Categorical
   plot_univariate_cat(df_clean, cat_cols)
   
   # Bivariate
   plot_correlation(df_clean, num_cols_updated)
   
   # Target-Aware Analysis (if TARGET_COL provided)
   if TARGET_COL and TARGET_COL in df_clean.columns:
       print(f"\nTarget Analysis: {TARGET_COL}")
       if TARGET_COL in num_cols_updated:
           # Numeric Target
           target_corr = df_clean[num_cols_updated].corr()[TARGET_COL].sort_values(ascending=False)
           print("Correlations with Target:")
           print(target_corr)
       else:
           # Categorical Target
           plt.figure(figsize=(10, 5))
           sns.countplot(data=df_clean, x=TARGET_COL, hue=TARGET_COL, palette='magma', legend=False)
           plt.title(f"Target Distribution: {TARGET_COL}")
           plt.xticks(rotation=45)
           plt.show()
   # ==========================================
   # I) Final Artifact Output
   # ==========================================
   print("\n--- FINAL SUMMARY ---")
   print(f"Original Shape: {df.shape}")
   print(f"Cleaned Shape:  {df_clean.shape}")
   print(f"Duplicates removed: {df.duplicated().sum()}")
   print(f"Columns processed: {list(df_clean.columns)}")
   print("\nProcessed Data Preview (df_clean.head()):")
   print(df_clean.head())

else:

   print("DataFrame is empty. Pipeline terminated.")