Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import sqlite3 | |
| # Load the Anime Dataset as a Pandas DataFrame | |
| df = pd.read_csv('data/anime-dataset-2023.csv') | |
| # Separate the DataFrame into separate tables | |
| # Genre Table | |
| # Create a separate DataFrame using Genres column | |
| genres_df = df['Genres'].str.split(', ').explode().str.strip().drop_duplicates().reset_index(drop=True) | |
| genres_df = pd.DataFrame({'genre_id': range(1, len(genres_df) + 1), 'genre_name': genres_df}) | |
| # Exclude some Genres | |
| excluded_genres = ['Hentai', 'UNKNOWN', 'Erotica', 'Ecchi'] | |
| genres_df = genres_df[~genres_df['genre_name'].isin(excluded_genres)] | |
| # Anime Table | |
| # Filter out animes that contain excluded genres | |
| pattern = '|'.join(excluded_genres) | |
| anime_df = df[~df['Genres'].str.contains(pattern, na=True)] | |
| # Filter out animes without English Names | |
| anime_df = anime_df[anime_df['English name'] != 'UNKNOWN'] | |
| # Filter out animes without Scores | |
| anime_df = anime_df[anime_df['Score'] != 'UNKNOWN'] | |
| # Filter out animes without Synopsis | |
| anime_df = anime_df[anime_df['Synopsis'] != 'No description available for this anime.'] | |
| # Sort by Score and Keep only Top 1000 Animes | |
| anime_df = anime_df.sort_values(by='Score', ascending=False) | |
| anime_df = anime_df.head(1000) | |
| # Rename Columns | |
| anime_df = anime_df.rename(columns={'English name': 'name', 'Score': 'score', 'Synopsis': 'synopsis'}) | |
| # AnimeGenre Table | |
| # Create a mapping of genre_name to genre_id | |
| genre_mapping = genres_df.set_index('genre_name')['genre_id'].to_dict() | |
| # Explode genres for filtered animes and map to genre_ids | |
| anime_genre_df = anime_df[['anime_id', 'Genres']].copy() | |
| anime_genre_df = anime_genre_df.assign(genre_name=anime_genre_df['Genres'].str.split(', ')).explode('genre_name') | |
| anime_genre_df['genre_name'] = anime_genre_df['genre_name'].str.strip() | |
| anime_genre_df['genre_id'] = anime_genre_df['genre_name'].map(genre_mapping) | |
| anime_genre_df = anime_genre_df[['anime_id', 'genre_id']].dropna() | |
| anime_genre_df['genre_id'] = anime_genre_df['genre_id'].astype(int) | |
| # Final Clean Up | |
| # Keep only anime_id, Name, Score and Synopsis Columns | |
| anime_df = anime_df[['anime_id', 'name', 'score', 'synopsis']] | |
| anime_df = anime_df.rename(columns={'anime_id': 'id'}) | |
| genres_df = genres_df.rename(columns={'genre_id': 'id'}) | |
| SCHEMA_SQL = ''' | |
| PRAGMA foreign_keys = ON; | |
| CREATE TABLE IF NOT EXISTS Anime ( | |
| id INTEGER PRIMARY KEY, | |
| name VARCHAR(50), | |
| score FLOAT, | |
| synopsis TEXT | |
| ); | |
| CREATE TABLE IF NOT EXISTS Genre ( | |
| id INTEGER PRIMARY KEY, | |
| genre_name VARCHAR(20) | |
| ); | |
| CREATE TABLE IF NOT EXISTS AnimeGenre ( | |
| anime_id INTEGER NOT NULL, | |
| genre_id INTEGER NOT NULL, | |
| PRIMARY KEY (anime_id, genre_id), | |
| FOREIGN KEY (anime_id) REFERENCES Anime(id) ON DELETE CASCADE ON UPDATE CASCADE, | |
| FOREIGN KEY (genre_id) REFERENCES Genre(id) ON DELETE CASCADE ON UPDATE CASCADE | |
| ); | |
| CREATE INDEX IF NOT EXISTS idx_anime_id ON AnimeGenre(anime_id); | |
| CREATE INDEX IF NOT EXISTS idx_genre_id ON AnimeGenre(genre_id); | |
| ''' | |
| with sqlite3.connect('anime.db') as conn: | |
| conn.executescript(SCHEMA_SQL) | |
| anime_df.to_sql('Anime', conn, if_exists='delete_rows', index=False, method='multi') | |
| genres_df.to_sql('Genre', conn, if_exists='delete_rows', index=False, method='multi') | |
| anime_genre_df.to_sql('AnimeGenre', conn, if_exists='delete_rows', index=False, method='multi') | |