junchenfu committed on
Commit
e402584
·
verified ·
1 Parent(s): ecc1332

Upload pipline.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. pipline.py +17 -19
pipline.py CHANGED
@@ -33,20 +33,24 @@ def generate_rag_prompt(user_prompt,
33
  set_random_seed(seed)
34
 
35
  # 2. Load various data files
36
- views_data = pd.read_csv('Microlens/MicroLens-100k_likes_and_views.txt',
37
  sep='\t', header=None, names=['video_id','likes','views'])
38
- title_data = pd.read_csv('Microlens/MicroLens-100k_title_en.csv',
39
- sep=',', header=None, names=['video_id','title_en'])
40
- cover_data = pd.read_csv('Microlens/llava-v1.5_caption.txt',
41
- sep=',', header=None, names=['video_id','cover_desc'])
42
- desc_data = pd.read_csv('Microlens/Microlens100K_captions_en.csv',
43
- sep='\t', header=None, names=['video_id','caption_en'])
 
 
44
  tags_data = pd.read_csv('Microlens/tags_to_summary.csv',
45
- sep=',', header=None, names=['video_id','partition'])
46
-
 
47
  # 3. Load comment data and count comments for each video_id
48
  comments_data = pd.read_csv('Microlens/MicroLens-100k_comment_en.txt',
49
- sep='\t', header=None, names=['user_id','video_id','comment_text'])
 
50
  comments_data = comments_data[['video_id','comment_text']]
51
 
52
  # Group by video_id and count comments
@@ -57,23 +61,17 @@ def generate_rag_prompt(user_prompt,
57
  .reset_index(name='comment_count')
58
  )
59
 
60
- # Merge all data
61
  merged = (
62
  views_data
63
- .merge(title_data, on='video_id', how='left')
64
  .merge(cover_data, on='video_id', how='left')
65
  .merge(desc_data, on='video_id', how='left')
66
  .merge(tags_data, on='video_id', how='left')
67
  .merge(comment_count_df, on='video_id', how='left')
68
  )
 
 
69
 
70
- # Load test set IDs
71
- test_id_data = pd.read_csv('/MicroLens/test_id.csv',
72
- sep=',', header=None, names=['video_id'])
73
-
74
- # Perform inner join with the merged dataframe on 'video_id'
75
- merged = merged.merge(test_id_data, on='video_id', how='inner')
76
-
77
  # Drop rows with missing values in key fields
78
  merged.dropna(subset=['title_en', 'cover_desc', 'caption_en', 'partition', 'comment_count'], inplace=True)
79
 
 
33
  set_random_seed(seed)
34
 
35
  # 2. Load various data files
36
+ views_data = pd.read_csv('Microlens/MicroLens-100k_likes_and_views.txt',
37
  sep='\t', header=None, names=['video_id','likes','views'])
38
+ # MicroLens-100k_title_en.csv contains cover image descriptions (cover_desc)
39
+ cover_data = pd.read_csv('Microlens/MicroLens-100k_title_en.csv',
40
+ sep=',', header=None, names=['video_id','cover_desc'],
41
+ on_bad_lines='skip')
42
+ # Microlens100K_captions_en.csv contains video captions; use as title_en and caption_en
43
+ desc_data = pd.read_csv('Microlens/Microlens100K_captions_en.csv',
44
+ sep='\t', header=None, names=['video_id','caption_en'],
45
+ on_bad_lines='skip')
46
  tags_data = pd.read_csv('Microlens/tags_to_summary.csv',
47
+ sep=',', header=None, names=['video_id','partition'],
48
+ on_bad_lines='skip')
49
+
50
  # 3. Load comment data and count comments for each video_id
51
  comments_data = pd.read_csv('Microlens/MicroLens-100k_comment_en.txt',
52
+ sep='\t', header=None, names=['user_id','video_id','comment_text'],
53
+ on_bad_lines='skip')
54
  comments_data = comments_data[['video_id','comment_text']]
55
 
56
  # Group by video_id and count comments
 
61
  .reset_index(name='comment_count')
62
  )
63
 
64
+ # Merge all data (use cover_desc as title_en since no separate title file is available)
65
  merged = (
66
  views_data
 
67
  .merge(cover_data, on='video_id', how='left')
68
  .merge(desc_data, on='video_id', how='left')
69
  .merge(tags_data, on='video_id', how='left')
70
  .merge(comment_count_df, on='video_id', how='left')
71
  )
72
+ # Use cover_desc as title_en for compatibility with downstream prompts
73
+ merged['title_en'] = merged['cover_desc']
74
 
 
 
 
 
 
 
 
75
  # Drop rows with missing values in key fields
76
  merged.dropna(subset=['title_en', 'cover_desc', 'caption_en', 'partition', 'comment_count'], inplace=True)
77