Spaces:

broadfield-dev
/

HF-Dataset-Commander

Sleeping

App Files Files Community

broadfield-dev committed on Dec 30, 2025

Commit

175e3dd

verified ·

1 Parent(s): e1c75d3

Update processor.py

Browse files

Files changed (1) hide show

processor.py +20 -10

processor.py CHANGED Viewed

@@ -79,7 +79,7 @@ class DatasetCommandCenter:
     def _sanitize_for_json(self, obj):
         """
-        Ensures data is safe for JSON serialization to prevent UI crashes (NaN, NaT, Infinity).
         """
         if isinstance(obj, float):
             if math.isnan(obj) or math.isinf(obj):
@@ -133,6 +133,9 @@ class DatasetCommandCenter:
             for i, row in enumerate(ds_stream):
                 if i >= 10: break
                 # Clean row for UI
                 clean_row = self._sanitize_for_json(row)
                 sample_rows.append(clean_row)
@@ -179,8 +182,6 @@ class DatasetCommandCenter:
     def _get_value_by_path(self, obj, path):
         """
         Retrieves a value from the row.
-        PRIORITY 1: Exact Key Match (Simplest, safest path).
-        PRIORITY 2: Dot Notation Traversal (for nested JSON).
         """
         if not path: return obj
@@ -195,12 +196,17 @@ class DatasetCommandCenter:
         current = obj
         for i, key in enumerate(keys):
-            # Access key with duck-typing support (works on dicts, UserDicts, etc)
             try:
-                current = current[key]
             except:
-                return None # Key not found
             # Lazy Parsing: Only parse string if we need to go deeper
             is_last_key = (i == len(keys) - 1)
             if not is_last_key and isinstance(current, str):
@@ -209,7 +215,7 @@ class DatasetCommandCenter:
                     try:
                         current = json.loads(s)
                     except:
-                        return None # Broken JSON
         return current
@@ -230,6 +236,7 @@ class DatasetCommandCenter:
         matched_item = None
         for item in data:
             if str(item.get(filter_key, '')) == str(filter_val):
                 matched_item = item
                 break
@@ -242,7 +249,7 @@ class DatasetCommandCenter:
     def _apply_projection(self, row, recipe):
         new_row = {}
-        # Eval Context
         eval_context = row.copy()
         eval_context['row'] = row
         eval_context['json'] = json
@@ -334,9 +341,10 @@ The following operations were applied to the source data:
             ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
             count = 0
             for i, row in enumerate(ds_stream):
                 if max_rows and count >= int(max_rows):
                     break
                 row = dict(row)
                 # 1. Filter
@@ -391,8 +399,10 @@ The following operations were applied to the source data:
             for i, row in enumerate(ds_stream):
                 if len(processed) >= 5: break
-                row = dict(row)
                 # Check Filter
                 passed = True
                 if recipe.get('filter_rule'):

     def _sanitize_for_json(self, obj):
         """
+        Recursively cleans data for JSON serialization.
         """
         if isinstance(obj, float):
             if math.isnan(obj) or math.isinf(obj):
             for i, row in enumerate(ds_stream):
                 if i >= 10: break
+                # CRITICAL FIX: Force Materialization
+                row = dict(row)
                 # Clean row for UI
                 clean_row = self._sanitize_for_json(row)
                 sample_rows.append(clean_row)
     def _get_value_by_path(self, obj, path):
         """
         Retrieves a value from the row.
         """
         if not path: return obj
         current = obj
         for i, key in enumerate(keys):
             try:
+                # Use get() if possible, or key access
+                if isinstance(current, dict):
+                    current = current.get(key)
+                else:
+                    return None
             except:
+                return None
+            if current is None: return None
             # Lazy Parsing: Only parse string if we need to go deeper
             is_last_key = (i == len(keys) - 1)
             if not is_last_key and isinstance(current, str):
                     try:
                         current = json.loads(s)
                     except:
+                        return None
         return current
         matched_item = None
         for item in data:
+            # String comparison for safety
             if str(item.get(filter_key, '')) == str(filter_val):
                 matched_item = item
                 break
     def _apply_projection(self, row, recipe):
         new_row = {}
+        # Eval Context (requires explicit dict)
         eval_context = row.copy()
         eval_context['row'] = row
         eval_context['json'] = json
             ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
             count = 0
             for i, row in enumerate(ds_stream):
                 if max_rows and count >= int(max_rows):
                     break
+                # CRITICAL FIX: Force Materialization
                 row = dict(row)
                 # 1. Filter
             for i, row in enumerate(ds_stream):
                 if len(processed) >= 5: break
+                # CRITICAL FIX: Force Materialization
+                row = dict(row)
                 # Check Filter
                 passed = True
                 if recipe.get('filter_rule'):