| """
|
| Dataset Generation Pipeline for TinyBert-CNN Intent Classifier.
|
| Generates (student_input, session_context, label) triples for 5-class classification.
|
| """
|
|
|
| import random
|
| import pandas as pd
|
| import os
|
| import re
|
|
|
|
|
|
|
|
|
|
|
# Ordered curriculum. Position matters: generate_session_context() treats the
# entries just before the current index as "previous" topics, and
# get_off_topic_question() treats every later entry as a not-yet-covered topic.
PYTHON_TOPICS = [
    "Variables and Data Types",
    "Strings and Formatting",
    "Arithmetic Operators",
    "Boolean Logic",
    "If/Else Conditionals",
    "For Loops",
    "While Loops",
    "Lists and Tuples",
    "Dictionaries",
    "Sets",
    "Functions and Scope",
    "Lambda Functions",
    "Error Handling (Try/Except)",
    "Classes and OOP",
    "File Handling",
]
|
|
|
# Intent name -> integer class id written to the "label" column.
# The keys are matched verbatim by build_dataset(), so their exact spelling
# (including the lower-case "clarification") is significant.
LABEL_MAP = {
    "On-Topic Question": 0,
    "Off-Topic Question": 1,
    "Emotional-State": 2,
    "Pace-Related": 3,
    "Repeat/clarification": 4,
}
|
|
|
# Values sampled uniformly for the "emotion:" field of the session context.
EMOTIONS = [
    "neutral",
    "engaged",
    "focused",
    "frustrated",
    "confused",
    "bored",
    "tired",
    "anxious",
    "excited",
    "overwhelmed",
]

# Values sampled uniformly for the "pace:" field of the session context.
PACES = [
    "normal",
    "fast",
    "slow",
    "rushed",
    "dragging",
    "moderate",
    "steady",
]
|
|
|
|
|
|
|
|
|
|
|
def generate_session_context(current_topic_idx):
    """Build a compact, pipe-delimited session context for one sample.

    Args:
        current_topic_idx: Index into PYTHON_TOPICS of the topic being taught.

    Returns:
        A ``(context, current_topic, prev_topics)`` tuple where ``context`` is
        ``"topic:… | prev:… | ability:… | emotion:… | pace:… | slides:a,b,c"``,
        ``current_topic`` is the topic name and ``prev_topics`` is the list of
        up to three topics immediately preceding it (empty for index 0).
    """
    current_topic = PYTHON_TOPICS[current_topic_idx]

    # Window of at most three topics directly before the current one.
    if current_topic_idx <= 0:
        prev_topics = []
    else:
        window_start = current_topic_idx - min(3, current_topic_idx)
        prev_topics = PYTHON_TOPICS[window_start:current_topic_idx]

    # Drop any parenthesised suffix, e.g. "Error Handling (Try/Except)".
    base_names = [name.split("(")[0].strip() for name in prev_topics]

    # One random 30-100% score per previous topic; these draws happen before
    # the emotion / pace / slide draws below.
    ability_parts = [
        f"{base.replace(' and ', '&')}:{random.randint(30, 100)}%"
        for base in base_names
    ]

    if prev_topics:
        ability_str = ",".join(ability_parts)
        prev_str = ",".join(base_names)
    else:
        ability_str = "N/A"
        prev_str = "None"

    emotion = random.choice(EMOTIONS)
    pace = random.choice(PACES)
    slide = random.randint(5, 60)

    fields = [
        f"topic:{current_topic}",
        f"prev:{prev_str}",
        f"ability:{ability_str}",
        f"emotion:{emotion}",
        f"pace:{pace}",
        # Current slide together with its immediate neighbours.
        f"slides:{slide-1},{slide},{slide+1}",
    ]
    return " | ".join(fields), current_topic, prev_topics
|
|
|
|
|
|
|
|
|
|
|
|
|
# Question templates about the topic currently being taught.
# "{topic}" is substituted with the current topic name via str.replace().
ON_TOPIC_TEMPLATES = [
    # How-to, explanation and syntax questions
    "How do I use {topic} in my code?",
    "Can you explain {topic} again?",
    "What are the best practices for {topic}?",
    "Can you show me an example of {topic}?",
    "Why is {topic} giving me a syntax error?",
    "Is there a different way to write {topic}?",
    "I don't get the part about {topic}.",
    "Can we do another exercise for {topic}?",
    "What happens if I forget to close the bracket in {topic}?",
    "How is {topic} different from the previous topic?",

    # Motivation: why / when
    "Why do we need {topic}?",
    "When should I use {topic} vs the other approach?",
    "What's the point of {topic}?",
    "Is {topic} used a lot in real projects?",
    "Can you give me a real-world example of {topic}?",
    "Does {topic} work the same way in other languages?",

    # Stuck / debugging
    "I'm stuck on this challenge about {topic}.",
    "My code for {topic} isn't working, can you help?",
    "I keep getting an error with {topic}.",
    "Why does my {topic} code print the wrong output?",
    "What am I doing wrong with {topic}?",
    "Can you debug this {topic} example with me?",

    # Follow-ups and deeper dives
    "What did you mean when you said {topic} works like that?",
    "Can you go deeper into {topic}?",
    "Is there more to know about {topic}?",
    "How does {topic} connect to what we learned before?",
    "What's the difference between the two approaches you showed for {topic}?",
    "Can you break down {topic} step by step?",

    # Application in larger programs
    "How would I use {topic} in a project?",
    "Can I combine {topic} with what we learned earlier?",
    "Is {topic} something I'll use every day?",
    "Where does {topic} fit in a larger program?",
    "Can you show me a more advanced use of {topic}?",

    # Short / informal phrasings
    "Tell me more about {topic}",
    "What's {topic} again?",
    "{topic} is confusing",
    "Help me with {topic}",
    "I need help understanding {topic}",
    "So how does {topic} actually work?",
    "Wait, explain {topic} one more time",
]
|
|
|
|
|
# On-topic templates that also mention an earlier topic.
# Both "{topic}" and "{prev_topic}" are filled in by get_on_topic_question().
ON_TOPIC_CONTEXT_TEMPLATES = [
    "You said I scored low on {prev_topic}, does that affect how I should approach {topic}?",
    "Since I did well on {prev_topic}, is {topic} going to be similar?",
    "How does {prev_topic} relate to {topic}?",
    "I understood {prev_topic} but {topic} feels completely different, why?",
    "Can we review {prev_topic} briefly before diving deeper into {topic}?",
    "My score on {prev_topic} was not great, will I need it for {topic}?",
]
|
|
|
# Off-topic utterances unrelated to the Python course; used verbatim
# (no placeholders).
OFF_TOPIC_GENERAL = [
    "What's the weather like today?",
    "How do I cook pasta?",
    "Who won the soccer match last night?",
    "Can you recommend a good movie to watch?",
    "What is the capital of France?",
    "How much does a new car cost?",
    "Do you like listening to music?",
    "Tell me a joke.",
    "I'm feeling hungry, should I order pizza?",
    "What is your favorite color?",
    "What time is it?",
    "Do you know any good restaurants nearby?",
    "Who is the president of the United States?",
    "What's the best phone to buy right now?",
    "Can you help me with my math homework?",
    "How tall is the Eiffel Tower?",
    "What should I eat for dinner?",
    "Do you watch Netflix?",
    "What's the meaning of life?",
    "How do I fix my car?",
]
|
|
|
# Off-topic templates about a curriculum topic that comes *after* the current
# one; "{topic}" is replaced with that later topic by get_off_topic_question().
OFF_TOPIC_FUTURE_TOPIC_TEMPLATES = [
    "Are we going to learn about {topic} soon?",
    "What is {topic} exactly?",
    "I heard about {topic}, can you explain it to me?",
    "How does {topic} work in Python?",
    "Can we skip ahead to {topic}?",
    "Is {topic} hard to learn?",
    "I saw someone using {topic}, what does it do?",
    "Do we need to know about {topic}?",
    "When will we cover {topic}?",
    "My friend told me {topic} is important, is that true?",
    "Will {topic} be on the exam?",
    "Can you give me a sneak peek of {topic}?",
    "I already know a bit about {topic}, can we jump to it?",
    "How long until we get to {topic}?",
    "Is {topic} related to what we are doing now?",
]
|
|
|
# Utterances expressing the student's emotional state; used verbatim.
EMOTIONAL_TEMPLATES = [
    # Frustration / anger
    "I am so frustrated right now.",
    "This is making me really angry.",
    "I can't take this anymore.",
    "I feel like giving up.",
    "Nothing makes sense to me.",
    "I'm losing my patience.",
    "Why is this so hard?",
    "I feel stupid for not getting this.",

    # Positive / excited
    "This is really starting to make sense!",
    "I love coding, this is fun!",
    "Wow, I finally understand it!",
    "I am ready to tackle the next challenge!",
    "This is getting exciting!",
    "I feel so good about this now.",
    "I'm having a great time learning this.",
    "That was actually easier than I thought.",

    # Confusion
    "I feel completely stuck and confused.",
    "I have no idea what's going on.",
    "My brain is fried.",
    "I'm lost.",
    "I don't understand anything.",
    "This is so confusing it hurts.",

    # Boredom / fatigue
    "This is getting boring.",
    "I'm feeling super tired today.",
    "My head hurts from all this information.",
    "I feel like I'm not making any progress.",
    "Can we do something more interesting?",
    "I'm so sleepy right now.",
    "This is not engaging at all.",
    "My eyes are glazing over.",

    # Anxiety / stress
    "I'm nervous about the upcoming test.",
    "What if I fail?",
    "I feel anxious about falling behind.",
    "Everyone else seems to get it except me.",
    "I'm stressed out.",

    # Mixed feelings
    "I'm confused, I feel so dumb right now.",
    "I'm excited but also scared I'll mess up.",
    "I'm frustrated because this used to make sense.",
    "I feel overwhelmed by all this new stuff.",
    "I just feel really down today.",
]
|
|
|
# Utterances about the speed or timing of the session; used verbatim.
PACE_TEMPLATES = [
    # Requests to slow down / wait
    "Can we slow down a bit?",
    "You are going way too fast.",
    "Wait, can you slow down the explanation?",
    "I need more time to process this.",
    "Can you wait a second before moving to the next slide?",
    "Hold on, I'm still writing notes.",
    "Please slow down, I can't keep up.",
    "You're moving too quickly for me.",
    "I need a moment to think about this.",
    "Can we pause for a minute?",
    "Don't rush through this please.",
    "Slow down, I'm still on the last example.",
    "Give me a sec, I'm still processing.",

    # Requests to speed up / move on
    "Let's move on to the next topic.",
    "Can we skip this?",
    "I think I got this, let's speed up.",
    "Can we go through the next part faster?",
    "Let's speed up the pace, I'm bored.",
    "I already know this, can we move on?",
    "This part is easy, let's go faster.",
    "Skip ahead please.",
    "Next topic please.",
    "We're spending too long on this.",
    "Can we pick up the pace?",

    # Breaks and remaining time
    "Can we take a break?",
    "How much time do we have left?",
    "When does this session end?",
    "I need a 5 minute break.",
    "Let's take a quick breather.",

    # General pacing remarks
    "The pace feels about right.",
    "Can you adjust the speed a bit?",
    "I think the pacing is off.",
    "Are we on schedule?",
    "How many more slides do we have?",
]
|
|
|
# Requests to repeat, re-show or clarify something that was just presented;
# used verbatim (no placeholders).
REPEAT_TEMPLATES = [
    "Can you repeat that last part?",
    "What did you say about the slide right before this one?",
    "Could you clarify what you meant?",
    "I didn't catch that, can you say it again?",
    "Say that again?",
    "Can you go back to the previous slide for a second?",
    "I missed the first step, can you re-explain?",
    "Can you repeat the rule for that?",
    "Could you run through the explanation one more time?",
    "Can you clarify the difference between the two examples?",
    "Wait, what was that?",
    "Huh? Can you repeat?",
    "I didn't understand, please say it again.",
    "Sorry, I zoned out. What did you just say?",
    "Come again?",
    "Can you show that example one more time?",
    "Go back to that last point please.",
    "I need you to repeat the definition.",
    "What was the syntax you just showed?",
    "Can you re-explain how that works?",
    "I lost you there, can you start over on that point?",
    "Please repeat the steps.",
    "Sorry, can you go over that again from the beginning?",
    "What was the output of that code again?",
    "Can you re-run that example?",
    "I missed it, one more time please.",
    "I need to hear that explanation again.",
    "Can you walk me through that once more?",
    "Let me see that slide again.",
    "I need a recap of what you just said.",
    "Can you summarize what you just explained?",
    "What were the key points of that last section?",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
# Word/phrase -> interchangeable alternatives used by augment_synonym().
SYNONYM_MAP = {
    "explain": ["describe", "clarify", "elaborate on", "break down", "walk me through"],
    "show": ["demonstrate", "present", "display", "give me"],
    "help": ["assist", "support", "aid"],
    "use": ["utilize", "apply", "work with"],
    "understand": ["get", "grasp", "comprehend", "follow"],
    "repeat": ["say again", "go over again", "redo", "recap"],
    "confused": ["lost", "puzzled", "unsure", "baffled"],
    "stuck": ["blocked", "stalled", "unable to proceed"],
    "slow down": ["take it easy", "go slower", "ease up"],
    "speed up": ["go faster", "pick up the pace", "hurry up"],
    "example": ["demo", "sample", "illustration", "instance"],
    "error": ["bug", "mistake", "issue", "problem"],
    "different": ["alternative", "another", "other"],
    "code": ["program", "script", "snippet"],
}

# Conversational openers that augment_filler() may prepend.
FILLERS = ["umm", "so", "like", "hey", "well", "basically", "honestly", "actually", "ok so", "right"]


def augment_synonym(text):
    """Replace at most one whole word/phrase from SYNONYM_MAP with a synonym.

    The keys of SYNONYM_MAP are tried in insertion order.  For each key that
    occurs in ``text`` as a whole word (case-insensitive), there is a 35%
    chance that its first occurrence is replaced by a randomly chosen synonym;
    after the first replacement the function returns.

    Matching is anchored on word boundaries so that a key is never rewritten
    inside a longer word (previously "use" matched inside "because" and
    produced "becautilize", "show" matched inside "showed", and so on).

    Args:
        text: The utterance to augment.

    Returns:
        The possibly modified utterance; ``text`` unchanged if nothing fired.
    """
    for word, synonyms in SYNONYM_MAP.items():
        pattern = re.compile(rf"\b{re.escape(word)}\b", re.IGNORECASE)
        # random.random() is only consulted when the key is actually present.
        if pattern.search(text) and random.random() < 0.35:
            replacement = random.choice(synonyms)
            # A callable replacement is inserted literally, so no backslash or
            # group-reference processing is applied to the synonym.
            return pattern.sub(lambda _match: replacement, text, count=1)
    return text
|
|
|
def augment_case(text):
    """Randomly change the casing of ``text``.

    A single uniform draw decides the outcome: below 0.30 the text is
    lower-cased, from 0.30 up to (but excluding) 0.38 it is upper-cased,
    otherwise it is returned unchanged.
    """
    roll = random.random()
    if roll >= 0.38:
        return text
    return text.lower() if roll < 0.3 else text.upper()
|
|
|
def augment_punctuation(text):
    """Randomly alter the trailing punctuation of ``text``.

    One uniform draw selects the variant:
      * below 0.25  -> trailing ``?``, ``!`` and ``.`` replaced by a single ``?``
      * below 0.40  -> trailing punctuation removed
      * below 0.48  -> trailing punctuation replaced by ``!!``
      * otherwise   -> text returned unchanged
    """
    roll = random.random()
    if roll >= 0.48:
        return text

    stem = text.rstrip("?!.")
    if roll < 0.25:
        return stem + "?"
    if roll < 0.4:
        return stem
    return stem + "!!"
|
|
|
def augment_filler(text):
    """Prepend a random entry of FILLERS (plus a space) with 20% probability.

    Returns ``text`` unchanged in the remaining cases.
    """
    if random.random() >= 0.2:
        return text
    filler = random.choice(FILLERS)
    return filler + " " + text
|
|
|
def augment_typo(text, prob=0.08):
    """Inject character-level typos into ``text``.

    The whole step is skipped (text returned as-is) when the first uniform
    draw exceeds 0.35.  Otherwise every position gets its own draw; when that
    draw is below ``prob`` and the character currently at the position is a
    letter, one of three edits is applied:

      * ``swap``      - exchange with the next character (no-op on the last one)
      * ``delete``    - remove the character
      * ``duplicate`` - double the character

    Args:
        text: The utterance to corrupt.
        prob: Per-character probability of applying an edit.

    Returns:
        The possibly corrupted utterance.
    """
    if random.random() > 0.35:
        return text

    chars = list(text)
    last = len(chars) - 1
    for pos in range(len(chars)):
        # The draw is made for every position, letter or not; the character is
        # read at visit time, so a letter swapped forward can be edited again.
        if not (random.random() < prob and chars[pos].isalpha()):
            continue

        op = random.choice(["swap", "delete", "duplicate"])
        if op == "swap":
            if pos < last:
                chars[pos], chars[pos + 1] = chars[pos + 1], chars[pos]
        elif op == "delete":
            chars[pos] = ""
        else:  # "duplicate"
            chars[pos] = chars[pos] * 2
    return "".join(chars)
|
|
|
def augment_word_swap(text):
    """Swap one random pair of adjacent words with 15% probability.

    Texts of two words or fewer are always returned unchanged (without
    consuming a random draw).  When a swap happens the words are re-joined
    with single spaces.
    """
    tokens = text.split()
    if len(tokens) > 2 and random.random() <= 0.15:
        left = random.randint(0, len(tokens) - 2)
        # Moving the right-hand neighbour in front of `left` swaps the pair.
        tokens.insert(left, tokens.pop(left + 1))
        return " ".join(tokens)
    return text
|
|
|
def augment_word_delete(text):
    """Drop one randomly chosen interior word with 12% probability.

    The first and last words are never removed, and texts of three words or
    fewer are always returned unchanged (without consuming a random draw).
    When a word is dropped the remainder is re-joined with single spaces.
    """
    tokens = text.split()
    if len(tokens) > 3 and random.random() <= 0.12:
        victim = random.randint(1, len(tokens) - 2)
        return " ".join(tokens[:victim] + tokens[victim + 1:])
    return text
|
|
|
def augment_text(text):
    """Run ``text`` through one to three randomly selected augmentation steps.

    The number of steps is drawn first, then that many distinct strategies are
    sampled (in random order) from the seven available ones and applied in
    sequence.  Each strategy applies its own internal probability, so a
    selected step may still leave the text untouched.  Leading and trailing
    whitespace is stripped from the result.
    """
    strategies = [
        augment_synonym,
        augment_case,
        augment_punctuation,
        augment_filler,
        augment_typo,
        augment_word_swap,
        augment_word_delete,
    ]

    how_many = random.randint(1, 3)
    for step in random.sample(strategies, k=how_many):
        text = step(text)
    return text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_on_topic_question(current_topic, prev_topics):
    """Return a question about ``current_topic``.

    When ``prev_topics`` is non-empty there is a 20% chance of using a template
    that also references one randomly chosen previous topic; otherwise a plain
    on-topic template is used.

    Args:
        current_topic: Name substituted for ``{topic}``.
        prev_topics: Candidate names for ``{prev_topic}``; may be empty.
    """
    # bool(...) keeps the short-circuit: no random draw when prev_topics is empty.
    use_context = bool(prev_topics) and random.random() < 0.2
    if not use_context:
        plain = random.choice(ON_TOPIC_TEMPLATES)
        return plain.replace("{topic}", current_topic)

    prev_topic = random.choice(prev_topics)
    contextual = random.choice(ON_TOPIC_CONTEXT_TEMPLATES)
    filled = contextual.replace("{topic}", current_topic)
    return filled.replace("{prev_topic}", prev_topic)
|
|
|
def get_off_topic_question(current_topic_idx):
    """Return an off-topic utterance.

    If at least one topic follows ``current_topic_idx`` in PYTHON_TOPICS, there
    is a 50% chance of asking about one of those later topics; in all other
    cases a general, course-unrelated question is returned.
    """
    has_later_topic = current_topic_idx < len(PYTHON_TOPICS) - 1
    # The coin flip is only drawn when a later topic exists.
    if not (has_later_topic and random.random() < 0.5):
        return random.choice(OFF_TOPIC_GENERAL)

    upcoming = PYTHON_TOPICS[current_topic_idx + 1:]
    future_topic = random.choice(upcoming)
    template = random.choice(OFF_TOPIC_FUTURE_TOPIC_TEMPLATES)
    return template.replace("{topic}", future_topic)
|
|
|
def get_emotional_state():
    """Return one uniformly chosen entry of EMOTIONAL_TEMPLATES."""
    utterance = random.choice(EMOTIONAL_TEMPLATES)
    return utterance
|
|
|
def get_pace_related():
    """Return one uniformly chosen entry of PACE_TEMPLATES."""
    utterance = random.choice(PACE_TEMPLATES)
    return utterance
|
|
|
def get_repeat_clarification():
    """Return one uniformly chosen entry of REPEAT_TEMPLATES."""
    utterance = random.choice(REPEAT_TEMPLATES)
    return utterance
|
|
|
|
|
|
|
|
|
|
|
|
|
def _generate_student_input(intent, topic_idx, current_topic, prev_topics):
    """Return one raw (un-augmented) utterance for the given intent name."""
    if intent == 'On-Topic Question':
        return get_on_topic_question(current_topic, prev_topics)
    if intent == 'Off-Topic Question':
        return get_off_topic_question(topic_idx)
    if intent == 'Emotional-State':
        return get_emotional_state()
    if intent == 'Pace-Related':
        return get_pace_related()
    if intent == 'Repeat/clarification':
        return get_repeat_clarification()
    # Fail loudly instead of emitting off-topic text under an unrelated label.
    raise ValueError(f"No generator defined for intent {intent!r}")


def _generate_samples(num_samples_per_class):
    """Create ``num_samples_per_class`` augmented rows for every LABEL_MAP entry."""
    rows = []
    for intent, label_id in LABEL_MAP.items():
        for _ in range(num_samples_per_class):
            topic_idx = random.randint(0, len(PYTHON_TOPICS) - 1)
            context_str, current_topic, prev_topics = generate_session_context(topic_idx)
            raw_input = _generate_student_input(intent, topic_idx, current_topic, prev_topics)
            rows.append({
                'student_input': augment_text(raw_input),
                'session_context': context_str,
                'label': label_id,
                'intent_name': intent
            })
    return rows


def _stratified_split(df, train_ratio, val_ratio):
    """Split ``df`` per label into shuffled train/val/test frames."""
    train_dfs, val_dfs, test_dfs = [], [], []
    for label_id in sorted(df['label'].unique()):
        label_df = df[df['label'] == label_id].reset_index(drop=True)
        n = len(label_df)
        t1 = int(n * train_ratio)
        t2 = int(n * (train_ratio + val_ratio))
        train_dfs.append(label_df.iloc[:t1])
        val_dfs.append(label_df.iloc[t1:t2])
        # Test receives whatever is left after the train and val slices.
        test_dfs.append(label_df.iloc[t2:])

    train_df, val_df, test_df = (
        pd.concat(parts).sample(frac=1, random_state=42).reset_index(drop=True)
        for parts in (train_dfs, val_dfs, test_dfs)
    )
    return train_df, val_df, test_df


def build_dataset(num_samples_per_class=2000, train_ratio=0.70, val_ratio=0.15, test_ratio=0.15,
                  seed=None, output_dir='data'):
    """Generate the synthetic intent dataset and write train/val/test CSVs.

    For every intent in LABEL_MAP, ``num_samples_per_class`` rows are created,
    each with a random session context and an augmented student utterance.
    Rows are shuffled, split per label (so each split keeps the class balance)
    and written to ``train.csv``, ``val.csv`` and ``test.csv`` under
    ``output_dir`` with the columns ``student_input``, ``session_context``,
    ``label`` and ``intent_name``.

    Args:
        num_samples_per_class: Rows to generate per intent; must be >= 1.
        train_ratio: Fraction of each class assigned to the training split.
        val_ratio: Fraction of each class assigned to the validation split.
        test_ratio: Expected fraction for the test split.  The test split is
            always the remainder after train and val; a warning is issued if
            this value disagrees with that remainder.
        seed: Optional seed for the global ``random`` module.  Without it the
            generated text differs between runs even though the pandas
            shuffles use a fixed ``random_state``.
        output_dir: Directory for the CSV files; created if missing.

    Raises:
        ValueError: If ``num_samples_per_class`` is below 1, a ratio is
            negative, or ``train_ratio + val_ratio`` exceeds 1.
    """
    import warnings

    if num_samples_per_class < 1:
        raise ValueError("num_samples_per_class must be at least 1")
    if train_ratio < 0 or val_ratio < 0 or test_ratio < 0:
        raise ValueError("split ratios must be non-negative")
    if train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError("train_ratio + val_ratio must not exceed 1")

    remainder = 1.0 - train_ratio - val_ratio
    if abs(remainder - test_ratio) > 1e-6:
        warnings.warn(
            f"test_ratio={test_ratio} does not match the remainder {remainder:.4f} "
            "left after train and val; the remainder is used.",
            stacklevel=2,
        )

    if seed is not None:
        random.seed(seed)

    print(f"Starting Dataset Generation ({num_samples_per_class} per class)...")

    df = pd.DataFrame(_generate_samples(num_samples_per_class))
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    train_df, val_df, test_df = _stratified_split(df, train_ratio, val_ratio)

    os.makedirs(output_dir, exist_ok=True)
    train_df.to_csv(os.path.join(output_dir, 'train.csv'), index=False)
    val_df.to_csv(os.path.join(output_dir, 'val.csv'), index=False)
    test_df.to_csv(os.path.join(output_dir, 'test.csv'), index=False)

    print("[+] Data Generation Complete!")
    print(f"Total: {len(df)} | Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")
    print(f"Train distribution:\n{train_df['label'].value_counts().sort_index().to_string()}")
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: generate 2000 samples for each intent class.
    build_dataset(num_samples_per_class=2000)
|
|
|