broadfield-dev committed · verified
Commit 2fcce3e · 1 Parent(s): a738515

Update processor.py

Files changed (1):
  1. processor.py +194 -103
processor.py CHANGED
@@ -14,49 +14,61 @@ class DatasetCommandCenter:
         self.token = token
         self.api = HfApi(token=token)
 
-    # --- 1. METADATA & INSPECTION ---
 
     def get_dataset_metadata(self, dataset_id):
-        configs = []
-        splits = []
         license_name = "unknown"
 
-        # 1. Get Configs
         try:
-            configs = get_dataset_config_names(dataset_id, token=self.token)
-        except Exception as e:
-            logger.warning(f"Could not fetch configs: {e}")
-            configs = ['default']
-
-        # 2. Get Splits & License
-        try:
-            selected_config = configs[0] if configs else 'default'
-            infos = get_dataset_infos(dataset_id, token=self.token)
-
-            info_obj = None
-            if selected_config in infos:
-                info_obj = infos[selected_config]
-            elif 'default' in infos:
-                info_obj = infos['default']
-            elif len(infos) > 0:
-                info_obj = list(infos.values())[0]
 
-            if info_obj:
-                splits = list(info_obj.splits.keys())
-                license_name = info_obj.license or "unknown"
 
-        except Exception as e:
-            logger.warning(f"Metadata fetch fallback: {e}")
-            splits = ['train', 'test', 'validation']
 
-        return {
-            "status": "success",
-            "configs": configs if configs else ['default'],
-            "splits": splits if splits else ['train'],
-            "license_detected": license_name
-        }
 
     def get_splits_for_config(self, dataset_id, config_name):
         try:
             infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
             if config_name in infos:
@@ -65,112 +77,165 @@ class DatasetCommandCenter:
                 splits = list(infos.values())[0].splits.keys()
             else:
                 splits = ['train', 'test']
         except:
-            splits = ['train', 'test', 'validation']
 
-        return {"status": "success", "splits": splits}
 
     def inspect_dataset(self, dataset_id, config, split):
         try:
             conf = config if config != 'default' else None
             ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
 
             sample_rows = []
-            schema_map = {}
 
             for i, row in enumerate(ds_stream):
                 if i >= 10: break
 
-                # Create clean sample for UI
                 clean_row = {}
                 for k, v in row.items():
-                    # Convert objects to strings for display safety
                     if not isinstance(v, (str, int, float, bool, list, dict, type(None))):
                         clean_row[k] = str(v)
                     else:
                         clean_row[k] = v
                 sample_rows.append(clean_row)
 
-                # Analyze Schema
                 for k, v in row.items():
-                    if k not in schema_map:
-                        schema_map[k] = {"is_list": False, "keys": set()}
 
                     val = v
-                    # Check for JSON string
                     if isinstance(val, str):
-                        try:
-                            val = json.loads(val)
                         except: pass
 
-                    if isinstance(val, list):
-                        schema_map[k]["is_list"] = True
-                        if len(val) > 0 and isinstance(val[0], dict):
-                            schema_map[k]["keys"].update(val[0].keys())
-                    elif isinstance(val, dict):
-                        schema_map[k]["keys"].update(val.keys())
-
-            formatted_schema = {}
-            for k, info in schema_map.items():
-                formatted_schema[k] = {
-                    "type": "List" if info["is_list"] else "Object",
-                    "keys": list(info["keys"])
-                }
 
             return {
                 "status": "success",
                 "samples": sample_rows,
-                "schema": formatted_schema,
                 "dataset_id": dataset_id
             }
         except Exception as e:
             return {"status": "error", "message": str(e)}
 
-    # --- 2. EXTRACTION LOGIC ---
 
     def _get_value_by_path(self, obj, path):
         if not path: return obj
         keys = path.split('.')
         current = obj
 
         for key in keys:
             if isinstance(current, str):
                 s = current.strip()
                 if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
                     try:
                         current = json.loads(s)
-                    except: pass
 
             if isinstance(current, dict) and key in current:
                 current = current[key]
             else:
-                return None
         return current
 
     def _extract_from_list_logic(self, row, source_col, filter_key, filter_val, target_path):
         data = row.get(source_col)
         if isinstance(data, str):
             try:
                 data = json.loads(data)
-            except: return None
 
         if not isinstance(data, list):
             return None
 
         matched_item = None
         for item in data:
             if str(item.get(filter_key, '')) == str(filter_val):
                 matched_item = item
                 break
 
         if matched_item:
             return self._get_value_by_path(matched_item, target_path)
         return None
 
     def _apply_projection(self, row, recipe):
         new_row = {}
 
-        # Setup Context for Python/Eval
         eval_context = row.copy()
         eval_context['row'] = row
         eval_context['json'] = json
@@ -194,21 +259,25 @@ class DatasetCommandCenter:
                     )
 
                 elif t_type == 'python':
                     expression = col_def['expression']
                     val = eval(expression, {}, eval_context)
                     new_row[target_col] = val
 
             except Exception as e:
-                # Fail Fast: Raise error to stop the generator
                 raise ValueError(f"Column '{target_col}' failed: {str(e)}")
 
         return new_row
 
-    # --- 3. DOCUMENTATION (CARD) ---
 
     def _generate_card(self, source_id, target_id, recipe, license_name):
-        logger.info(f"Generating card for {target_id} with license {license_name}")
-
         card_data = DatasetCardData(
             language="en",
             license=license_name,
@@ -226,20 +295,22 @@ It was generated using the **Hugging Face Dataset Command Center**.
 
 The following operations were applied to the source data:
 
-| Target Column | Source | Type | Logic / Filter |
-|---------------|--------|------|----------------|
 """
         for col in recipe['columns']:
             c_type = col.get('type', 'simple')
             c_name = col['name']
-            c_src = col.get('source', '-')
 
             logic = "-"
-            if c_type == 'simple': logic = "Direct Mapping"
-            elif c_type == 'list_search': logic = f"Get `{col['target_key']}` where `{col['filter_key']} == {col['filter_val']}`"
-            elif c_type == 'python': logic = f"`{col.get('expression')}`"
 
-            content += f"| **{c_name}** | `{c_src}` | {c_type} | {logic} |\n"
 
         if recipe.get('filter_rule'):
             content += f"\n### Row Filtering\n**Filter Applied:** `{recipe['filter_rule']}`\n"
@@ -249,7 +320,9 @@ The following operations were applied to the source data:
         card = DatasetCard.from_template(card_data, content=content)
         return card
 
-    # --- 4. EXECUTION ---
 
     def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None, new_license=None):
         logger.info(f"Job started: {source_id} -> {target_id}")
@@ -259,39 +332,42 @@ The following operations were applied to the source data:
             ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
             count = 0
             for i, row in enumerate(ds_stream):
-                if max_rows and count >= int(max_rows): break
 
-                # Filter
                 if recipe.get('filter_rule'):
                     try:
                         ctx = row.copy()
                         ctx['row'] = row
                         if not eval(recipe['filter_rule'], {}, ctx):
                             continue
                     except Exception as e:
                         raise ValueError(f"Filter crashed on row {i}: {e}")
 
-                # Projection
                 try:
                     yield self._apply_projection(row, recipe)
                     count += 1
                 except ValueError as ve:
                     raise ve
                 except Exception as e:
-                    raise ValueError(f"Crash on row {i}: {e}")
 
         try:
-            # 1. Push Data
            new_dataset = datasets.Dataset.from_generator(gen)
            new_dataset.push_to_hub(target_id, token=self.token)
 
-            # 2. Push Card
            try:
                card = self._generate_card(source_id, target_id, recipe, new_license or "unknown")
                card.push_to_hub(target_id, token=self.token)
            except Exception as e:
                logger.error(f"Failed to push Dataset Card: {e}")
-                # We do NOT fail the whole job, but we log it.
 
            return {"status": "success", "rows_processed": len(new_dataset)}
 
@@ -299,27 +375,42 @@ The following operations were applied to the source data:
             logger.error(f"Job Failed: {e}")
             return {"status": "failed", "error": str(e)}
 
-    # --- 5. PREVIEW ---
 
     def preview_transform(self, dataset_id, config, split, recipe):
         conf = config if config != 'default' else None
-        ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
-        processed = []
-        for row in ds_stream:
-            if len(processed) >= 5: break
-
-            # Filter
-            passed = True
-            if recipe.get('filter_rule'):
-                try:
-                    ctx = row.copy()
-                    ctx['row'] = row
-                    if not eval(recipe['filter_rule'], {}, ctx): passed = False
-                except: passed = False
 
-            if passed:
-                try:
-                    processed.append(self._apply_projection(row, recipe))
-                except Exception as e:
-                    processed.append({"error": str(e)}) # Show error in preview
-
-        return processed
@@ -14,49 +14,61 @@ class DatasetCommandCenter:
         self.token = token
         self.api = HfApi(token=token)
 
+    # ==========================================
+    # 1. METADATA & INSPECTION
+    # ==========================================
 
     def get_dataset_metadata(self, dataset_id):
+        """
+        Fetches available Configs (subsets), Splits, and License info
+        without downloading the actual data rows.
+        """
+        configs = ['default']
+        splits = ['train', 'test', 'validation']
         license_name = "unknown"
 
         try:
+            # 1. Fetch Configs
+            try:
+                found_configs = get_dataset_config_names(dataset_id, token=self.token)
+                if found_configs:
+                    configs = found_configs
+            except Exception:
+                pass # Keep default
 
+            # 2. Fetch Metadata (Splits & License)
+            try:
+                selected = configs[0]
+                # This API call can fail on some datasets, so we wrap it safely
+                infos = get_dataset_infos(dataset_id, token=self.token)
 
+                info = None
+                if selected in infos:
+                    info = infos[selected]
+                elif 'default' in infos:
+                    info = infos['default']
+                elif infos:
+                    info = list(infos.values())[0]
+
+                if info:
+                    splits = list(info.splits.keys())
+                    license_name = info.license or "unknown"
+            except Exception:
+                pass # Keep defaults if metadata fails
 
+            return {
+                "status": "success",
+                "configs": configs,
+                "splits": splits,
+                "license_detected": license_name
+            }
+        except Exception as e:
+            return {"status": "error", "message": str(e)}
 
     def get_splits_for_config(self, dataset_id, config_name):
+        """
+        Updates the Split dropdown when the user changes the Config.
+        """
         try:
             infos = get_dataset_infos(dataset_id, config_name=config_name, token=self.token)
             if config_name in infos:
@@ -65,112 +77,165 @@ class DatasetCommandCenter:
                 splits = list(infos.values())[0].splits.keys()
             else:
                 splits = ['train', 'test']
+            return {"status": "success", "splits": splits}
         except:
+            return {"status": "success", "splits": ['train', 'test', 'validation']}
+
+    def _flatten_object(self, obj, parent_key='', sep='.'):
+        """
+        Recursively finds all keys in nested dicts or JSON strings
+        to populate the 'Simple Path' dropdown in the UI.
+        """
+        items = {}
+
+        # Transparently parse JSON strings
+        if isinstance(obj, str):
+            s = obj.strip()
+            if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
+                try:
+                    obj = json.loads(s)
+                except:
+                    pass # Keep as string if parse fails
+
+        if isinstance(obj, dict):
+            for k, v in obj.items():
+                new_key = f"{parent_key}{sep}{k}" if parent_key else k
+                items.update(self._flatten_object(v, new_key, sep=sep))
+        elif isinstance(obj, list):
+            # We mark lists but do not recurse infinitely
+            new_key = f"{parent_key}" if parent_key else "list_content"
+            items[new_key] = "List"
+        else:
+            # Leaf node
+            items[parent_key] = type(obj).__name__
 
+        return items
 
     def inspect_dataset(self, dataset_id, config, split):
+        """
+        Scans the first 10 rows to build a Schema Tree for the UI.
+        """
         try:
             conf = config if config != 'default' else None
             ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
 
             sample_rows = []
+            available_paths = set()
+            schema_map = {} # Used for List Mode detection
 
             for i, row in enumerate(ds_stream):
                 if i >= 10: break
 
+                # 1. Clean row for UI Preview (convert objects to strings)
                 clean_row = {}
                 for k, v in row.items():
                     if not isinstance(v, (str, int, float, bool, list, dict, type(None))):
                         clean_row[k] = str(v)
                     else:
                         clean_row[k] = v
                 sample_rows.append(clean_row)
 
+                # 2. Deep Flattening for "Simple Path" dropdowns
+                flattened = self._flatten_object(row)
+                available_paths.update(flattened.keys())
+
+                # 3. Top Level Analysis for "List Mode" detection
                 for k, v in row.items():
+                    if k not in schema_map:
+                        schema_map[k] = {"type": "Object"}
 
                     val = v
                     if isinstance(val, str):
+                        try: val = json.loads(val)
                         except: pass
 
+                    if isinstance(val, list):
+                        schema_map[k]["type"] = "List"
+
+            # Reconstruct Schema Tree for UI grouping
+            sorted_paths = sorted(list(available_paths))
+            schema_tree = {}
+            for path in sorted_paths:
+                root = path.split('.')[0]
+                if root not in schema_tree:
+                    schema_tree[root] = []
+                schema_tree[root].append(path)
 
             return {
                 "status": "success",
                 "samples": sample_rows,
+                "schema_tree": schema_tree, # Used by Simple Path Dropdown
+                "schema": schema_map, # Used by List Mode Dropdown
                 "dataset_id": dataset_id
             }
         except Exception as e:
             return {"status": "error", "message": str(e)}
 
+    # ==========================================
+    # 2. CORE EXTRACTION LOGIC
+    # ==========================================
 
     def _get_value_by_path(self, obj, path):
+        """
+        Navigates dot notation (meta.user.id), automatically parsing
+        JSON strings if encountered along the path.
+        """
         if not path: return obj
         keys = path.split('.')
         current = obj
 
         for key in keys:
+            # Auto-parse JSON string if encountered
             if isinstance(current, str):
                 s = current.strip()
                 if (s.startswith('{') and s.endswith('}')) or (s.startswith('[') and s.endswith(']')):
                     try:
                         current = json.loads(s)
+                    except:
+                        pass
 
             if isinstance(current, dict) and key in current:
                 current = current[key]
             else:
+                return None # Path broken
         return current
 
     def _extract_from_list_logic(self, row, source_col, filter_key, filter_val, target_path):
+        """
+        Logic for: FROM source_col FIND ITEM WHERE filter_key == filter_val EXTRACT target_path
+        """
         data = row.get(source_col)
+
+        # Parse if string
         if isinstance(data, str):
             try:
                 data = json.loads(data)
+            except:
+                return None
 
         if not isinstance(data, list):
             return None
 
         matched_item = None
         for item in data:
+            # String comparison for safety
             if str(item.get(filter_key, '')) == str(filter_val):
                 matched_item = item
                 break
 
         if matched_item:
             return self._get_value_by_path(matched_item, target_path)
+
         return None
 
     def _apply_projection(self, row, recipe):
+        """
+        Builds the new row based on the recipe.
+        Raises ValueError if user Python code fails (Fail Fast).
+        """
         new_row = {}
 
+        # Setup Eval Context (Variables available in Python Mode)
         eval_context = row.copy()
         eval_context['row'] = row
         eval_context['json'] = json
@@ -194,21 +259,25 @@ class DatasetCommandCenter:
                     )
 
                 elif t_type == 'python':
+                    # Execute user code
                     expression = col_def['expression']
                     val = eval(expression, {}, eval_context)
                     new_row[target_col] = val
 
             except Exception as e:
+                # Fail Fast: Stop the generator immediately if a column fails
                 raise ValueError(f"Column '{target_col}' failed: {str(e)}")
 
         return new_row
 
+    # ==========================================
+    # 3. DOCUMENTATION (MODEL CARD)
+    # ==========================================
 
     def _generate_card(self, source_id, target_id, recipe, license_name):
+        """
+        Creates a README.md for the new dataset.
+        """
         card_data = DatasetCardData(
             language="en",
             license=license_name,
@@ -226,20 +295,22 @@ It was generated using the **Hugging Face Dataset Command Center**.
 
 The following operations were applied to the source data:
 
+| Target Column | Operation Type | Logic |
+|---------------|----------------|-------|
 """
         for col in recipe['columns']:
             c_type = col.get('type', 'simple')
             c_name = col['name']
 
             logic = "-"
+            if c_type == 'simple':
+                logic = f"Mapped from `{col.get('source')}`"
+            elif c_type == 'list_search':
+                logic = f"Extracted `{col['target_key']}` where `{col['filter_key']} == {col['filter_val']}`"
+            elif c_type == 'python':
+                logic = f"Python: `{col.get('expression')}`"
 
+            content += f"| **{c_name}** | {c_type} | {logic} |\n"
 
         if recipe.get('filter_rule'):
             content += f"\n### Row Filtering\n**Filter Applied:** `{recipe['filter_rule']}`\n"
@@ -249,7 +320,9 @@ The following operations were applied to the source data:
         card = DatasetCard.from_template(card_data, content=content)
         return card
 
+    # ==========================================
+    # 4. EXECUTION
+    # ==========================================
 
     def process_and_push(self, source_id, config, split, target_id, recipe, max_rows=None, new_license=None):
         logger.info(f"Job started: {source_id} -> {target_id}")
@@ -259,39 +332,42 @@ The following operations were applied to the source data:
             ds_stream = load_dataset(source_id, name=conf, split=split, streaming=True, token=self.token)
             count = 0
             for i, row in enumerate(ds_stream):
+                if max_rows and count >= int(max_rows):
+                    break
 
+                # 1. Filter
                 if recipe.get('filter_rule'):
                     try:
                         ctx = row.copy()
                         ctx['row'] = row
+                        ctx['json'] = json
+                        ctx['re'] = re
                         if not eval(recipe['filter_rule'], {}, ctx):
                             continue
                     except Exception as e:
                         raise ValueError(f"Filter crashed on row {i}: {e}")
 
+                # 2. Projection
                 try:
                     yield self._apply_projection(row, recipe)
                     count += 1
                 except ValueError as ve:
+                    # Pass the specific column error up
                    raise ve
                 except Exception as e:
+                    raise ValueError(f"Unexpected crash on row {i}: {e}")
 
         try:
+            # 1. Process & Push Data
            new_dataset = datasets.Dataset.from_generator(gen)
            new_dataset.push_to_hub(target_id, token=self.token)
 
+            # 2. Generate & Push Card
            try:
                card = self._generate_card(source_id, target_id, recipe, new_license or "unknown")
                card.push_to_hub(target_id, token=self.token)
            except Exception as e:
                logger.error(f"Failed to push Dataset Card: {e}")
 
            return {"status": "success", "rows_processed": len(new_dataset)}
 
@@ -299,27 +375,42 @@ The following operations were applied to the source data:
             logger.error(f"Job Failed: {e}")
             return {"status": "failed", "error": str(e)}
 
+    # ==========================================
+    # 5. PREVIEW
+    # ==========================================
+
     def preview_transform(self, dataset_id, config, split, recipe):
         conf = config if config != 'default' else None
+
+        try:
+            ds_stream = load_dataset(dataset_id, name=conf, split=split, streaming=True, token=self.token)
+            processed = []
 
+            for i, row in enumerate(ds_stream):
+                if len(processed) >= 5: break
+
+                # Check Filter
+                passed = True
+                if recipe.get('filter_rule'):
+                    try:
+                        ctx = row.copy()
+                        ctx['row'] = row
+                        ctx['json'] = json
+                        ctx['re'] = re
+                        if not eval(recipe['filter_rule'], {}, ctx):
+                            passed = False
+                    except:
+                        passed = False # Skip invalid rows in preview
+
+                if passed:
+                    try:
+                        new_row = self._apply_projection(row, recipe)
+                        processed.append(new_row)
+                    except Exception as e:
+                        # In preview, we want to see the error, not crash
+                        processed.append({"_preview_error": f"Error: {str(e)}"})
+
+            return processed
+        except Exception as e:
+            # Return global error if loading fails
+            raise e
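
Usage sketch (not part of the commit): a minimal, hypothetical driver for the class this diff modifies. It assumes processor.py exposes DatasetCommandCenter, that the constructor takes only a Hub token (only the body of __init__ is visible above), and that recipes use the keys read by _apply_projection and _generate_card (a "columns" list with name/type plus source or expression, and an optional "filter_rule"). The dataset id, column names, and token are placeholders.

    from processor import DatasetCommandCenter  # assumed import path

    center = DatasetCommandCenter(token="hf_...")  # placeholder Hub token

    # Look up configs, splits, and the detected license without streaming any rows.
    meta = center.get_dataset_metadata("username/source-dataset")
    print(meta["configs"], meta["splits"], meta["license_detected"])

    # A small recipe exercising the 'simple' and 'python' column types.
    recipe = {
        "columns": [
            {"name": "text", "type": "simple", "source": "content"},
            {"name": "n_words", "type": "python", "expression": "len(str(row['content']).split())"},
        ],
        "filter_rule": "len(str(row['content'])) > 0",
    }

    # Preview the first few transformed rows before pushing anything to the Hub.
    for preview_row in center.preview_transform("username/source-dataset", "default", "train", recipe):
        print(preview_row)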