Spaces:
Sleeping
Sleeping
Update processor.py
Browse files- processor.py +20 -10
processor.py
CHANGED
|
@@ -79,7 +79,7 @@ class DatasetCommandCenter:
|
|
| 79 |
|
| 80 |
def _sanitize_for_json(self, obj):
|
| 81 |
"""
|
| 82 |
-
|
| 83 |
"""
|
| 84 |
if isinstance(obj, float):
|
| 85 |
if math.isnan(obj) or math.isinf(obj):
|
|
@@ -133,6 +133,9 @@ class DatasetCommandCenter:
|
|
| 133 |
for i, row in enumerate(ds_stream):
|
| 134 |
if i >= 10: break
|
| 135 |
|
|
|
|
|
|
|
|
|
|
| 136 |
# Clean row for UI
|
| 137 |
clean_row = self._sanitize_for_json(row)
|
| 138 |
sample_rows.append(clean_row)
|
|
@@ -179,8 +182,6 @@ class DatasetCommandCenter:
|
|
| 179 |
def _get_value_by_path(self, obj, path):
|
| 180 |
"""
|
| 181 |
Retrieves a value from the row.
|
| 182 |
-
PRIORITY 1: Exact Key Match (Simplest, safest path).
|
| 183 |
-
PRIORITY 2: Dot Notation Traversal (for nested JSON).
|
| 184 |
"""
|
| 185 |
if not path: return obj
|
| 186 |
|
|
@@ -195,12 +196,17 @@ class DatasetCommandCenter:
|
|
| 195 |
current = obj
|
| 196 |
|
| 197 |
for i, key in enumerate(keys):
|
| 198 |
-
# Access key with duck-typing support (works on dicts, UserDicts, etc)
|
| 199 |
try:
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
except:
|
| 202 |
-
return None
|
| 203 |
|
|
|
|
|
|
|
| 204 |
# Lazy Parsing: Only parse string if we need to go deeper
|
| 205 |
is_last_key = (i == len(keys) - 1)
|
| 206 |
if not is_last_key and isinstance(current, str):
|
|
@@ -209,7 +215,7 @@ class DatasetCommandCenter:
|
|
| 209 |
try:
|
| 210 |
current = json.loads(s)
|
| 211 |
except:
|
| 212 |
-
return None
|
| 213 |
|
| 214 |
return current
|
| 215 |
|
|
@@ -230,6 +236,7 @@ class DatasetCommandCenter:
|
|
| 230 |
|
| 231 |
matched_item = None
|
| 232 |
for item in data:
|
|
|
|
| 233 |
if str(item.get(filter_key, '')) == str(filter_val):
|
| 234 |
matched_item = item
|
| 235 |
break
|
|
@@ -242,7 +249,7 @@ class DatasetCommandCenter:
|
|
| 242 |
def _apply_projection(self, row, recipe):
|
| 243 |
new_row = {}
|
| 244 |
|
| 245 |
-
# Eval Context
|
| 246 |
eval_context = row.copy()
|
| 247 |
eval_context['row'] = row
|
| 248 |
eval_context['json'] = json
|
|
@@ -334,9 +341,10 @@ The following operations were applied to the source data:
|
|
| 334 |
ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
|
| 335 |
count = 0
|
| 336 |
for i, row in enumerate(ds_stream):
|
| 337 |
-
|
| 338 |
if max_rows and count >= int(max_rows):
|
| 339 |
break
|
|
|
|
|
|
|
| 340 |
row = dict(row)
|
| 341 |
|
| 342 |
# 1. Filter
|
|
@@ -391,8 +399,10 @@ The following operations were applied to the source data:
|
|
| 391 |
|
| 392 |
for i, row in enumerate(ds_stream):
|
| 393 |
if len(processed) >= 5: break
|
| 394 |
-
row = dict(row)
|
| 395 |
|
|
|
|
|
|
|
|
|
|
| 396 |
# Check Filter
|
| 397 |
passed = True
|
| 398 |
if recipe.get('filter_rule'):
|
|
|
|
| 79 |
|
| 80 |
def _sanitize_for_json(self, obj):
|
| 81 |
"""
|
| 82 |
+
Recursively cleans data for JSON serialization.
|
| 83 |
"""
|
| 84 |
if isinstance(obj, float):
|
| 85 |
if math.isnan(obj) or math.isinf(obj):
|
|
|
|
| 133 |
for i, row in enumerate(ds_stream):
|
| 134 |
if i >= 10: break
|
| 135 |
|
| 136 |
+
# CRITICAL FIX: Force Materialization
|
| 137 |
+
row = dict(row)
|
| 138 |
+
|
| 139 |
# Clean row for UI
|
| 140 |
clean_row = self._sanitize_for_json(row)
|
| 141 |
sample_rows.append(clean_row)
|
|
|
|
| 182 |
def _get_value_by_path(self, obj, path):
|
| 183 |
"""
|
| 184 |
Retrieves a value from the row.
|
|
|
|
|
|
|
| 185 |
"""
|
| 186 |
if not path: return obj
|
| 187 |
|
|
|
|
| 196 |
current = obj
|
| 197 |
|
| 198 |
for i, key in enumerate(keys):
|
|
|
|
| 199 |
try:
|
| 200 |
+
# Use get() if possible, or key access
|
| 201 |
+
if isinstance(current, dict):
|
| 202 |
+
current = current.get(key)
|
| 203 |
+
else:
|
| 204 |
+
return None
|
| 205 |
except:
|
| 206 |
+
return None
|
| 207 |
|
| 208 |
+
if current is None: return None
|
| 209 |
+
|
| 210 |
# Lazy Parsing: Only parse string if we need to go deeper
|
| 211 |
is_last_key = (i == len(keys) - 1)
|
| 212 |
if not is_last_key and isinstance(current, str):
|
|
|
|
| 215 |
try:
|
| 216 |
current = json.loads(s)
|
| 217 |
except:
|
| 218 |
+
return None
|
| 219 |
|
| 220 |
return current
|
| 221 |
|
|
|
|
| 236 |
|
| 237 |
matched_item = None
|
| 238 |
for item in data:
|
| 239 |
+
# String comparison for safety
|
| 240 |
if str(item.get(filter_key, '')) == str(filter_val):
|
| 241 |
matched_item = item
|
| 242 |
break
|
|
|
|
| 249 |
def _apply_projection(self, row, recipe):
|
| 250 |
new_row = {}
|
| 251 |
|
| 252 |
+
# Eval Context (requires explicit dict)
|
| 253 |
eval_context = row.copy()
|
| 254 |
eval_context['row'] = row
|
| 255 |
eval_context['json'] = json
|
|
|
|
| 341 |
ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
|
| 342 |
count = 0
|
| 343 |
for i, row in enumerate(ds_stream):
|
|
|
|
| 344 |
if max_rows and count >= int(max_rows):
|
| 345 |
break
|
| 346 |
+
|
| 347 |
+
# CRITICAL FIX: Force Materialization
|
| 348 |
row = dict(row)
|
| 349 |
|
| 350 |
# 1. Filter
|
|
|
|
| 399 |
|
| 400 |
for i, row in enumerate(ds_stream):
|
| 401 |
if len(processed) >= 5: break
|
|
|
|
| 402 |
|
| 403 |
+
# CRITICAL FIX: Force Materialization
|
| 404 |
+
row = dict(row)
|
| 405 |
+
|
| 406 |
# Check Filter
|
| 407 |
passed = True
|
| 408 |
if recipe.get('filter_rule'):
|