mikasenghaas committed on
Commit
2a902a6
·
unverified ·
1 Parent(s): 2fd24d8

Add test scripts

Browse files
Files changed (2) hide show
  1. test_template.py +311 -0
  2. test_tokenization.py +26 -0
test_template.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.12"
3
+ # dependencies = ["transformers", "jinja2"]
4
+ # ///
5
+
6
+
7
+ from transformers import AutoTokenizer
8
+
9
+
10
+ def print_section(title, messages, tokenizers, **tokenizer_kwargs):
11
+ """Print a titled banner, the messages, and each tokenizer's untokenized chat-template rendering of them."""
12
+ print(f"\n{'=' * 60}")
13
+ print(f"{title}")
14
+ print(f"{'=' * 60}")
15
+ print(f"\n{messages=}\n")
16
+ for tokenizer_name, tokenizer in tokenizers.items():
17
+ print(f"\n{tokenizer_name=}\n")
18
+ content = tokenizer.apply_chat_template(
19
+ messages, tokenize=False, **tokenizer_kwargs
20
+ )
21
+ print(content)
22
+
23
+
24
+ # Initialize tokenizers: the one in the current directory and Qwen3-Coder for comparison
25
+ local_tokenizer = AutoTokenizer.from_pretrained(".")
26
+ qwen3_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Coder-30B-A3B-Instruct")
27
+ tokenizers = {"Local": local_tokenizer, "Qwen3-Coder": qwen3_tokenizer}
28
+
29
+ # Only user message
30
+ print_section(
31
+ "User message only",
32
+ [{"role": "user", "content": "What is the capital of France?"}],
33
+ tokenizers,
34
+ )
35
+
36
+ # User message with generation prompt
37
+ print_section(
38
+ "User message with generation prompt",
39
+ [{"role": "user", "content": "What is the capital of France?"}],
40
+ tokenizers,
41
+ add_generation_prompt=True,
42
+ )
43
+
44
+ # User message with custom system message
45
+ print_section(
46
+ "Custom system message",
47
+ [
48
+ {"role": "system", "content": "You are a helpful assistant."},
49
+ {"role": "user", "content": "What is the capital of France?"},
50
+ ],
51
+ tokenizers,
52
+ )
53
+
54
+ # Single-turn with assistant response (no think)
55
+ print_section(
56
+ "Single-turn with assistant response (no think)",
57
+ [
58
+ {"role": "user", "content": "What is the capital of France?"},
59
+ {"role": "assistant", "content": "The capital of France is Paris."},
60
+ ],
61
+ tokenizers,
62
+ )
63
+
64
+ # Single-turn with think embedded in content
65
+ print_section(
66
+ "Single-turn with think embedded in content",
67
+ [
68
+ {"role": "user", "content": "What is the capital of France?"},
69
+ {
70
+ "role": "assistant",
71
+ "content": "<think>The user is asking about geography. France is a country in Europe, and its capital city is Paris. This is a straightforward factual question.</think>\nThe capital of France is Paris.",
72
+ },
73
+ ],
74
+ tokenizers,
75
+ )
76
+
77
+ # Single-turn with reasoning_content field
78
+ print_section(
79
+ "Single-turn with reasoning_content field",
80
+ [
81
+ {"role": "user", "content": "What is the capital of France?"},
82
+ {
83
+ "role": "assistant",
84
+ "content": "The capital of France is Paris.",
85
+ "reasoning_content": "The user is asking about geography. France is a country in Europe, and its capital city is Paris.",
86
+ },
87
+ ],
88
+ tokenizers,
89
+ )
90
+
91
+ print_section(
92
+ "Single-turn with think section and reasoning_content field",
93
+ [
94
+ {"role": "user", "content": "What is the capital of France?"},
95
+ {
96
+ "role": "assistant",
97
+ "content": "<think>The user is asking about geography. France is a country in Europe, and its capital city is Paris. This is a straightforward factual question.</think>\nThe capital of France is Paris.",
98
+ "reasoning_content": "The user is asking about geography. France is a country in Europe, and its capital city is Paris. This is a straightforward factual question.",
99
+ },
100
+ ],
101
+ tokenizers,
102
+ )
103
+
104
+
105
+ # Multi-turn and assistant response with think sections (embedded in content)
106
+ print_section(
107
+ "Multi-turn with think embedded in content",
108
+ [
109
+ {"role": "user", "content": "What is the capital of France?"},
110
+ {
111
+ "role": "assistant",
112
+ "content": "<think>This is a basic geography question.</think>\nThe capital of France is Paris.",
113
+ },
114
+ {"role": "user", "content": "What about Germany?"},
115
+ {
116
+ "role": "assistant",
117
+ "content": "<think>Another geography question. Germany's capital is Berlin.</think>\nThe capital of Germany is Berlin.",
118
+ },
119
+ ],
120
+ tokenizers,
121
+ )
122
+
123
+ # Multi-turn and assistant responses with reasoning supplied via the reasoning_content field
124
+ print_section(
125
+ "Multi-turn with reasoning_content field",
126
+ [
127
+ {"role": "user", "content": "What is the capital of France?"},
128
+ {
129
+ "role": "assistant",
130
+ "reasoning_content": "The user is asking about geography. France is a country in Europe, and its capital city is Paris.",
131
+ "content": "The capital of France is Paris.",
132
+ },
133
+ {"role": "user", "content": "What about Germany?"},
134
+ {
135
+ "role": "assistant",
136
+ "reasoning_content": "Another geography question. Germany's capital is Berlin.",
137
+ "content": "The capital of Germany is Berlin.",
138
+ },
139
+ ],
140
+ tokenizers,
141
+ )
142
+
143
+ # Assistant with only think section, no visible content
144
+ print_section(
145
+ "Assistant with only think section",
146
+ [
147
+ {
148
+ "role": "user",
149
+ "content": "Think about this problem but don't respond yet.",
150
+ },
151
+ {
152
+ "role": "assistant",
153
+ "content": "<think>The user wants me to think about something but not provide a response yet. I should just show my thinking process without any visible output.</think>",
154
+ },
155
+ ],
156
+ tokenizers,
157
+ )
158
+
159
+ # Assistant with unfinished think section
160
+ print_section(
161
+ "Assistant with unfinished think section",
162
+ [
163
+ {
164
+ "role": "user",
165
+ "content": "Think about this problem but don't respond yet.",
166
+ },
167
+ {
168
+ "role": "assistant",
169
+ "content": "<think>The user wants me to think about something but not provide a response yet. I should just",
170
+ },
171
+ ],
172
+ tokenizers,
173
+ )
174
+
175
+ print_section(
176
+ "Empty think content",
177
+ [
178
+ {"role": "user", "content": "Say hello"},
179
+ {"role": "assistant", "content": "<think></think>Hello! How can I help you today?"},
180
+ ],
181
+ tokenizers,
182
+ )
183
+
184
+ print_section(
185
+ "Empty reasoning content",
186
+ [
187
+ {"role": "system", "content": "You are a helpful assistant."},
188
+ {"role": "user", "content": "Say hello"},
189
+ {
190
+ "role": "assistant",
191
+ "content": "Hello! How can I help you today?",
192
+ "reasoning_content": "",
193
+ },
194
+ ],
195
+ tokenizers,
196
+ )
197
+
198
+
199
+ # ============================================================================
200
+ # EXAMPLE 7: Tool use scenario
201
+ # ============================================================================
202
+ tool_example = [
203
+ {"role": "user", "content": "What's the weather like in Paris?"},
204
+ {
205
+ "role": "assistant",
206
+ "content": "I'll check the weather in Paris for you.",
207
+ "reasoning_content": "I should use the get_weather tool for this.",
208
+ "tool_calls": [
209
+ {
210
+ "name": "get_weather",
211
+ "arguments": {"location": "Paris, France", "units": "celsius"},
212
+ }
213
+ ],
214
+ },
215
+ {
216
+ "role": "tool",
217
+ "content": "Current weather in Paris: 18°C, partly cloudy with light winds.",
218
+ },
219
+ {
220
+ "role": "assistant",
221
+ "content": "<think>The weather API returned current conditions for Paris. I should provide this information to the user in a clear format.</think>\nThe current weather in Paris is 18°C with partly cloudy skies and light winds. It's a pleasant day!",
222
+ },
223
+ ]
224
+
225
+ # Define tools for this example
226
+ tools = [
227
+ {
228
+ "type": "function",
229
+ "function": {
230
+ "name": "get_weather",
231
+ "description": "Get current weather information for a location",
232
+ "parameters": {
233
+ "type": "object",
234
+ "properties": {
235
+ "location": {
236
+ "type": "string",
237
+ "description": "The city and country",
238
+ },
239
+ "units": {"type": "string", "enum": ["celsius", "fahrenheit"]},
240
+ },
241
+ "required": ["location"],
242
+ },
243
+ },
244
+ }
245
+ ]
246
+
247
+ print_section(
248
+ "Single-turn tool use with weather",
249
+ tool_example,
250
+ tokenizers,
251
+ tools=tools,
252
+ )
253
+
254
+ # ============================================================================
255
+ # EXAMPLE 8: Multiple tool calls in one response
256
+ # ============================================================================
257
+ multi_tool_example = [
258
+ {
259
+ "role": "user",
260
+ "content": "I need to calculate 15 * 23 and also get the current time.",
261
+ },
262
+ {
263
+ "role": "assistant",
264
+ "content": "<think>The user wants two things: a calculation and the current time. I'll use two tools to get this information.</think>\nI'll help you with both the calculation and getting the current time.",
265
+ "tool_calls": [
266
+ {"name": "calculate", "arguments": {"expression": "15 * 23"}},
267
+ {"name": "get_current_time", "arguments": {}},
268
+ ],
269
+ },
270
+ {"role": "tool", "content": "345"},
271
+ {"role": "tool", "content": "2024-01-15T14:30:22Z"},
272
+ {
273
+ "role": "assistant",
274
+ "content": "Perfect! Here are your results:\n- 15 × 23 = 345\n- Current time: 2:30 PM UTC on January 15, 2024",
275
+ },
276
+ ]
277
+
278
+ multi_tools = [
279
+ {
280
+ "type": "function",
281
+ "function": {
282
+ "name": "calculate",
283
+ "description": "Perform mathematical calculations",
284
+ "parameters": {
285
+ "type": "object",
286
+ "properties": {
287
+ "expression": {
288
+ "type": "string",
289
+ "description": "Mathematical expression to evaluate",
290
+ }
291
+ },
292
+ "required": ["expression"],
293
+ },
294
+ },
295
+ },
296
+ {
297
+ "type": "function",
298
+ "function": {
299
+ "name": "get_current_time",
300
+ "description": "Get the current date and time",
301
+ "parameters": {"type": "object", "properties": {}},
302
+ },
303
+ },
304
+ ]
305
+
306
+ print_section(
307
+ "Single-turn with multiple tool calls",
308
+ multi_tool_example,
309
+ tokenizers,
310
+ tools=multi_tools,
311
+ )
test_tokenization.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.12"
3
+ # dependencies = ["transformers", "jinja2"]
4
+ # ///
5
+
6
+
7
+ from transformers import AutoTokenizer
8
+
9
+ # Initialize tokenizers: the one in the current directory and Qwen3-Coder for comparison
10
+ local_tokenizer = AutoTokenizer.from_pretrained(".")
11
+ qwen3_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Coder-30B-A3B-Instruct")
12
+
13
+
14
+ # User message with custom system message
15
+ messages = [
16
+ {"role": "system", "content": "You are a helpful assistant."},
17
+ {"role": "user", "content": "What is the capital of France?"},
18
+ ]
19
+
20
+ print("Local")
21
+ print(local_tokenizer.apply_chat_template(messages, tokenize=False))
22
+ print(local_tokenizer.apply_chat_template(messages, tokenize=True))
23
+
24
+ print("\n\nQwen3-Coder")
25
+ print(qwen3_tokenizer.apply_chat_template(messages, tokenize=False))
26
+ print(qwen3_tokenizer.apply_chat_template(messages, tokenize=True))