Upload pipline.py with huggingface_hub
Browse files- pipline.py +17 -19
pipline.py
CHANGED
|
@@ -33,20 +33,24 @@ def generate_rag_prompt(user_prompt,
|
|
| 33 |
set_random_seed(seed)
|
| 34 |
|
| 35 |
# 2. Load various data files
|
| 36 |
-
views_data = pd.read_csv('Microlens/MicroLens-100k_likes_and_views.txt',
|
| 37 |
sep='\t', header=None, names=['video_id','likes','views'])
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
tags_data = pd.read_csv('Microlens/tags_to_summary.csv',
|
| 45 |
-
sep=',', header=None, names=['video_id','partition']
|
| 46 |
-
|
|
|
|
| 47 |
# 3. Load comment data and count comments for each video_id
|
| 48 |
comments_data = pd.read_csv('Microlens/MicroLens-100k_comment_en.txt',
|
| 49 |
-
sep='\t', header=None, names=['user_id','video_id','comment_text']
|
|
|
|
| 50 |
comments_data = comments_data[['video_id','comment_text']]
|
| 51 |
|
| 52 |
# Group by video_id and count comments
|
|
@@ -57,23 +61,17 @@ def generate_rag_prompt(user_prompt,
|
|
| 57 |
.reset_index(name='comment_count')
|
| 58 |
)
|
| 59 |
|
| 60 |
-
# Merge all data
|
| 61 |
merged = (
|
| 62 |
views_data
|
| 63 |
-
.merge(title_data, on='video_id', how='left')
|
| 64 |
.merge(cover_data, on='video_id', how='left')
|
| 65 |
.merge(desc_data, on='video_id', how='left')
|
| 66 |
.merge(tags_data, on='video_id', how='left')
|
| 67 |
.merge(comment_count_df, on='video_id', how='left')
|
| 68 |
)
|
|
|
|
|
|
|
| 69 |
|
| 70 |
-
# Load test set IDs
|
| 71 |
-
test_id_data = pd.read_csv('/MicroLens/test_id.csv',
|
| 72 |
-
sep=',', header=None, names=['video_id'])
|
| 73 |
-
|
| 74 |
-
# Perform inner join with the merged dataframe on 'video_id'
|
| 75 |
-
merged = merged.merge(test_id_data, on='video_id', how='inner')
|
| 76 |
-
|
| 77 |
# Drop rows with missing values in key fields
|
| 78 |
merged.dropna(subset=['title_en', 'cover_desc', 'caption_en', 'partition', 'comment_count'], inplace=True)
|
| 79 |
|
|
|
|
| 33 |
set_random_seed(seed)
|
| 34 |
|
| 35 |
# 2. Load various data files
|
| 36 |
+
views_data = pd.read_csv('Microlens/MicroLens-100k_likes_and_views.txt',
|
| 37 |
sep='\t', header=None, names=['video_id','likes','views'])
|
| 38 |
+
# MicroLens-100k_title_en.csv contains cover image descriptions (cover_desc)
|
| 39 |
+
cover_data = pd.read_csv('Microlens/MicroLens-100k_title_en.csv',
|
| 40 |
+
sep=',', header=None, names=['video_id','cover_desc'],
|
| 41 |
+
on_bad_lines='skip')
|
| 42 |
+
# Microlens100K_captions_en.csv contains video captions; loaded as caption_en (title_en is derived from cover_desc after the merge)
|
| 43 |
+
desc_data = pd.read_csv('Microlens/Microlens100K_captions_en.csv',
|
| 44 |
+
sep='\t', header=None, names=['video_id','caption_en'],
|
| 45 |
+
on_bad_lines='skip')
|
| 46 |
tags_data = pd.read_csv('Microlens/tags_to_summary.csv',
|
| 47 |
+
sep=',', header=None, names=['video_id','partition'],
|
| 48 |
+
on_bad_lines='skip')
|
| 49 |
+
|
| 50 |
# 3. Load comment data and count comments for each video_id
|
| 51 |
comments_data = pd.read_csv('Microlens/MicroLens-100k_comment_en.txt',
|
| 52 |
+
sep='\t', header=None, names=['user_id','video_id','comment_text'],
|
| 53 |
+
on_bad_lines='skip')
|
| 54 |
comments_data = comments_data[['video_id','comment_text']]
|
| 55 |
|
| 56 |
# Group by video_id and count comments
|
|
|
|
| 61 |
.reset_index(name='comment_count')
|
| 62 |
)
|
| 63 |
|
| 64 |
+
# Merge all data (use cover_desc as title_en since no separate title file is available)
|
| 65 |
merged = (
|
| 66 |
views_data
|
|
|
|
| 67 |
.merge(cover_data, on='video_id', how='left')
|
| 68 |
.merge(desc_data, on='video_id', how='left')
|
| 69 |
.merge(tags_data, on='video_id', how='left')
|
| 70 |
.merge(comment_count_df, on='video_id', how='left')
|
| 71 |
)
|
| 72 |
+
# Use cover_desc as title_en for compatibility with downstream prompts
|
| 73 |
+
merged['title_en'] = merged['cover_desc']
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
# Drop rows with missing values in key fields
|
| 76 |
merged.dropna(subset=['title_en', 'cover_desc', 'caption_en', 'partition', 'comment_count'], inplace=True)
|
| 77 |
|