Upload 8 files

Browse files

Files changed (8) hide show

custom_datasets/__init__.py +2 -0
custom_datasets/discretized_cifar10.py +0 -0
custom_datasets/ten_species_dataset.py +0 -0
notebooks/eval_hyenadna_classifier.ipynb +196 -0
notebooks/qm9_data_prep.ipynb +428 -0
notebooks/qm9_vocab.json +32 -0
notebooks/zinc250k_data_prep.ipynb +411 -0
notebooks/zinc250k_vocab.json +64 -0

custom_datasets/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from . import discretized_cifar10
2	+ from . import ten_species_dataset

custom_datasets/discretized_cifar10.py ADDED Viewed

File without changes

custom_datasets/ten_species_dataset.py ADDED Viewed

File without changes

notebooks/eval_hyenadna_classifier.ipynb ADDED Viewed

	@@ -0,0 +1,196 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "id": "5b178466-559f-47ed-bcd1-a171641d47b5",
+   "metadata": {},
+   "source": [
+    "import os\n",
+    "\n",
+    "import hydra\n",
+    "import numpy as np\n",
+    "import omegaconf\n",
+    "import torch\n",
+    "import transformers\n",
+    "from sklearn.metrics import f1_score, matthews_corrcoef, precision_score, recall_score\n",
+    "from tqdm.auto import tqdm\n",
+    "\n",
+    "import classifier\n",
+    "import dataloader"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "08301e02-d279-426f-8aad-c23eea8fb120",
+   "metadata": {},
+   "source": [
+    "omegaconf.OmegaConf.register_new_resolver(\n",
+    "  'cwd', os.getcwd)\n",
+    "omegaconf.OmegaConf.register_new_resolver(\n",
+    "  'device_count', torch.cuda.device_count)\n",
+    "omegaconf.OmegaConf.register_new_resolver(\n",
+    "  'eval', eval)\n",
+    "omegaconf.OmegaConf.register_new_resolver(\n",
+    "  'div_up', lambda x, y: (x + y - 1) // y)\n",
+    "omegaconf.OmegaConf.register_new_resolver(\n",
+    "  'if_then_else',\n",
+    "  lambda condition, x, y: x if condition else y\n",
+    ")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "4685c167-63c8-4912-81e0-4ecd635fcc24",
+   "metadata": {},
+   "source": [
+    "# Load classifier\n",
+    "with hydra.initialize(version_base=None, config_path='../configs/'):\n",
+    "    classifier_config = hydra.compose(\n",
+    "        config_name='config',\n",
+    "        overrides=[\n",
+    "            'hydra.output_subdir=null',\n",
+    "            f\"hydra.run.dir={os.path.dirname(os.getcwd())}/outputs/ten_species/eval_classifier/hyenadna-small-32k_from-scratch_nlayer-8\",\n",
+    "            'hydra/job_logging=disabled',\n",
+    "            'hydra/hydra_logging=disabled',\n",
+    "            '+is_eval_classifier=True',\n",
+    "            'mode=train_classifier',\n",
+    "            'loader.global_batch_size=32',\n",
+    "            'loader.eval_global_batch_size=64',\n",
+    "            'loader.batch_size=1',\n",
+    "            'loader.eval_batch_size=1',\n",
+    "            'data=ten_species',\n",
+    "            'data.label_col=species_label',\n",
+    "            'data.num_classes=10',\n",
+    "            'classifier_model=hyenadna-classifier',\n",
+    "            'classifier_model.hyena_model_name_or_path=LongSafari/hyenadna-small-32k-seqlen-hf',\n",
+    "            'classifier_model.n_layer=8',\n",
+    "            'classifier_backbone=hyenadna',\n",
+    "            'model.length=32768',\n",
+    "            'diffusion=null',\n",
+    "            'T=null',\n",
+    "            f\"eval.checkpoint_path={os.path.dirname(os.getcwd())}/outputs/ten_species/eval_classifier/hyenadna-small-32k_from-scratch_nlayer-8/checkpoints/best.ckpt\",\n",
+    "        ]\n",
+    "    )\n",
+    "classifier_config = omegaconf.OmegaConf.create(classifier_config)\n",
+    "tokenizer = transformers.AutoTokenizer.from_pretrained(classifier_config.data.tokenizer_name_or_path, trust_remote_code=True)\n",
+    "pretrained_classifier = classifier.Classifier.load_from_checkpoint(\n",
+    "    classifier_config.eval.checkpoint_path,\n",
+    "    tokenizer=tokenizer,\n",
+    "    config=classifier_config, logger=False)\n",
+    "pretrained_classifier.eval();"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "bf18720b-64a9-4e9e-9e1e-2aa1c12dc6f0",
+   "metadata": {},
+   "source": [
+    "tokenizer = dataloader.get_tokenizer(classifier_config)\n",
+    "_, val_dl = dataloader.get_dataloaders(\n",
+    "    classifier_config, tokenizer, skip_train=True, valid_seed=classifier_config.seed)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "bdcd3ba7-e26a-4e36-a5fb-ff1fb747cc3c",
+   "metadata": {},
+   "source": [
+    "labels = []\n",
+    "preds = []\n",
+    "for batch in tqdm(val_dl):\n",
+    "    preds.append(\n",
+    "        pretrained_classifier(batch['input_ids'].to(pretrained_classifier.device)).argmax(dim=-1).detach().to(\n",
+    "            'cpu', non_blocking=True).numpy()\n",
+    "    )\n",
+    "    labels.append(batch['species_label'].numpy())"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "110ed75e-613c-4b6a-bb79-15517988735c",
+   "metadata": {},
+   "source": [
+    "labels = np.concatenate(labels)\n",
+    "preds = np.concatenate(preds)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "1558ca2e-6454-4c8c-b141-fca77f0025c5",
+   "metadata": {},
+   "source": [
+    "overall_accuracy_score = (preds == labels).sum() / preds.size\n",
+    "overall_f1_score = f1_score(y_pred=preds, y_true=labels, average=\"macro\", labels=list(range(classifier_config.data.num_classes)))\n",
+    "overall_mcc_score = matthews_corrcoef(y_pred=preds, y_true=labels)\n",
+    "\n",
+    "print(f\"Overall Acc: {overall_accuracy_score:0.3f}\")\n",
+    "print(f\"Overall F1:  {overall_f1_score:0.3f}\")\n",
+    "print(f\"Overall MCC: {overall_mcc_score:0.3f}\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "df8ce828-f6e1-4167-bae2-db4f13900758",
+   "metadata": {},
+   "source": [
+    "f1_scores = f1_score(y_pred=preds, y_true=labels, average=None , labels=list(range(classifier_config.data.num_classes)))\n",
+    "precision_scores = precision_score(y_pred=preds, y_true=labels, average=None , labels=list(range(classifier_config.data.num_classes)))\n",
+    "recall_scores = recall_score(y_pred=preds, y_true=labels, average=None , labels=list(range(classifier_config.data.num_classes)))\n",
+    "\n",
+    "species_list = ['Homo_sapiens', 'Mus_musculus', 'Drosophila_melanogaster', 'Danio_rerio',\n",
+    "                'Caenorhabditis_elegans', 'Gallus_gallus', 'Gorilla_gorilla', 'Felis_catus',\n",
+    "                'Salmo_trutta', 'Arabidopsis_thaliana']\n",
+    "for s in range(classifier_config.data.num_classes):\n",
+    "    print(f\"Class {s} - {species_list[s]}:\")\n",
+    "    print(f\"   F1:        {f1_scores[s]:0.3f}\")\n",
+    "    print(f\"   Precision: {precision_scores[s]:0.3f}\")\n",
+    "    print(f\"   Recall:    {recall_scores[s]:0.3f}\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "d18ca7cc-4fe6-4ba9-9175-1eac9ebca7b1",
+   "metadata": {},
+   "source": [],
+   "outputs": [],
+   "execution_count": null
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

notebooks/qm9_data_prep.ipynb ADDED Viewed

	@@ -0,0 +1,428 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "5fa7908c-b785-49ce-9e5d-7c6ad6b4378b",
+   "metadata": {},
+   "source": [
+    "## Imports and setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "d0c96204-ea08-4330-b1bb-784b259ec32e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import huggingface_hub"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "6813e76b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
+      "Token is valid (permission: write).\n",
+      "Your token has been saved to /share/kuleshov/yzs2/discrete-guidance/.hf_cache/token\n",
+      "Login successful\n"
+     ]
+    }
+   ],
+   "source": [
+    "if os.path.exists(os.path.join(os.environ['HF_HOME'], 'token')):\n",
+    "    with open(os.path.join(os.environ['HF_HOME'], 'token'), 'r') as f:\n",
+    "        token = f.read().strip()\n",
+    "else:\n",
+    "    token = None\n",
+    "huggingface_hub.login(token=token)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "61cb2ac4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import typing\n",
+    "\n",
+    "import datasets\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import rdkit\n",
+    "import transformers\n",
+    "from rdkit import Chem as rdChem\n",
+    "from rdkit.Chem import Crippen, QED\n",
+    "from rdkit.Contrib.NP_Score import npscorer\n",
+    "from rdkit.Contrib.SA_Score import sascorer\n",
+    "from tqdm.auto import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "24444c85",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: Update to 2024.03.6 release when available instead of suppressing warning!\n",
+    "#  See: https://github.com/rdkit/rdkit/issues/7625#\n",
+    "rdkit.rdBase.DisableLog('rdApp.warning')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "902de4c5-dda5-4e4c-a4dd-f3b88015464e",
+   "metadata": {},
+   "source": [
+    "## Create dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7b7a8986",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def parse_float(\n",
+    "    s: str\n",
+    ") -> float:\n",
+    "    \"\"\"Parses a float, including values written in `*^` exponent notation (e.g. `1.5*^-6`).\n",
+    "    \n",
+    "        Copied from https://www.kaggle.com/code/tawe141/extracting-data-from-qm9-xyz-files/code\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        return float(s)\n",
+    "    except ValueError:\n",
+    "        base, power = s.split('*^')\n",
+    "        return float(base) * 10**float(power)\n",
+    "\n",
+    "\n",
+    "def count_rings_and_bonds(\n",
+    "    mol: rdChem.Mol, max_ring_size: int = -1\n",
+    ") -> typing.Dict[str, int]:\n",
+    "    \"\"\"Counts rings (total and per ring size) and bonds (by type).\"\"\"\n",
+    "    \n",
+    "    # Counting rings\n",
+    "    ssr = rdChem.GetSymmSSSR(mol)\n",
+    "    ring_count = len(ssr)\n",
+    "    \n",
+    "    ring_sizes = {} if max_ring_size < 0 else {i: 0 for i in range(3, max_ring_size+1)}\n",
+    "    for ring in ssr:\n",
+    "        ring_size = len(ring)\n",
+    "        if ring_size not in ring_sizes:\n",
+    "            ring_sizes[ring_size] = 0\n",
+    "        ring_sizes[ring_size] += 1\n",
+    "    \n",
+    "    # Counting bond types\n",
+    "    bond_counts = {\n",
+    "        'single': 0,\n",
+    "        'double': 0,\n",
+    "        'triple': 0,\n",
+    "        'aromatic': 0\n",
+    "    }\n",
+    "    \n",
+    "    for bond in mol.GetBonds():\n",
+    "        if bond.GetIsAromatic():\n",
+    "            bond_counts['aromatic'] += 1\n",
+    "        elif bond.GetBondType() == rdChem.BondType.SINGLE:\n",
+    "            bond_counts['single'] += 1\n",
+    "        elif bond.GetBondType() == rdChem.BondType.DOUBLE:\n",
+    "            bond_counts['double'] += 1\n",
+    "        elif bond.GetBondType() == rdChem.BondType.TRIPLE:\n",
+    "            bond_counts['triple'] += 1\n",
+    "    result = {\n",
+    "        'ring_count': ring_count,\n",
+    "    }\n",
+    "    for k, v in ring_sizes.items():\n",
+    "        result[f\"R{k}\"] = v\n",
+    "\n",
+    "    for k, v in bond_counts.items():\n",
+    "        result[f\"{k}_bond\"] = v\n",
+    "    return result\n",
+    "\n",
+    "\n",
+    "def parse_xyz(\n",
+    "    filename: str,\n",
+    "    max_ring_size: int = -1,\n",
+    "    npscorer_model: typing.Optional[dict] = None,\n",
+    "    array_format: str = 'np'\n",
+    ") -> typing.Dict[str, typing.Any]:\n",
+    "    \"\"\"Parses QM9-specific xyz files.\n",
+    "    \n",
+    "        See https://www.nature.com/articles/sdata201422/tables/2 for reference.\n",
+    "        Adapted from https://www.kaggle.com/code/tawe141/extracting-data-from-qm9-xyz-files/code\n",
+    "    \"\"\"\n",
+    "    assert array_format in ['np', 'pt'], \\\n",
+    "        f\"Invalid array_format: `{array_format}` provided. Must be one of `np` (numpy.array), `pt` (torch.tensor).\"\n",
+    "    \n",
+    "    num_atoms = 0\n",
+    "    scalar_properties = []\n",
+    "    atomic_symbols = []\n",
+    "    xyz = []\n",
+    "    charges = []\n",
+    "    harmonic_vibrational_frequencies = []\n",
+    "    smiles = ''\n",
+    "    inchi = ''\n",
+    "    with open(filename, 'r') as f:\n",
+    "        for line_num, line in enumerate(f):\n",
+    "            if line_num == 0:\n",
+    "                num_atoms = int(line)\n",
+    "            elif line_num == 1:\n",
+    "                scalar_properties = [float(i) for i in line.split()[2:]]\n",
+    "            elif 2 <= line_num <= 1 + num_atoms:\n",
+    "                atom_symbol, x, y, z, charge = line.split()\n",
+    "                atomic_symbols.append(atom_symbol)\n",
+    "                xyz.append([parse_float(x), parse_float(y), parse_float(z)])\n",
+    "                charges.append(parse_float(charge))\n",
+    "            elif line_num == num_atoms + 2:\n",
+    "                harmonic_vibrational_frequencies = [float(i) for i in line.split()]\n",
+    "            elif line_num == num_atoms + 3:\n",
+    "                smiles = line.split()[0]\n",
+    "            elif line_num == num_atoms + 4:\n",
+    "                inchi = line.split()[0]\n",
+    "\n",
+    "    array_wrap = np.array if array_format == 'np' else torch.tensor\n",
+    "    result = {\n",
+    "        'num_atoms': num_atoms,\n",
+    "        'atomic_symbols': atomic_symbols,\n",
+    "        'pos': array_wrap(xyz),\n",
+    "        'charges': array_wrap(charges),\n",
+    "        'harmonic_oscillator_frequencies': array_wrap(harmonic_vibrational_frequencies),\n",
+    "        'smiles': smiles,\n",
+    "        'inchi': inchi\n",
+    "    }\n",
+    "    scalar_property_labels = [\n",
+    "        'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'u0', 'u', 'h', 'g', 'cv'\n",
+    "    ]    \n",
+    "    scalar_properties = dict(zip(scalar_property_labels, scalar_properties))\n",
+    "    result.update(scalar_properties)\n",
+    "\n",
+    "    # RdKit\n",
+    "    result['canonical_smiles'] = rdChem.CanonSmiles(result['smiles'])\n",
+    "    m = rdChem.MolFromSmiles(result['canonical_smiles'])\n",
+    "    result['logP'] = Crippen.MolLogP(m)\n",
+    "    result['qed'] = QED.qed(m)\n",
+    "    if npscorer_model is not None:\n",
+    "        result['np_score'] = npscorer.scoreMol(m, npscorer_model)\n",
+    "    result['sa_score'] = sascorer.calculateScore(m)\n",
+    "    result.update(count_rings_and_bonds(m, max_ring_size=max_ring_size))\n",
+    "    \n",
+    "    return result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72254d85",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "    Download xyz files from:\n",
+    "        https://figshare.com/collections/Quantum_chemistry_structures_and_properties_of_134_kilo_molecules/978904\n",
+    "    > wget https://figshare.com/ndownloader/files/3195389/dsgdb9nsd.xyz.tar.bz2\n",
+    "    > mkdir dsgdb9nsd.xyz\n",
+    "    > tar -xvjf dsgdb9nsd.xyz.tar.bz2 -C dsgdb9nsd.xyz\n",
+    "\"\"\"\n",
+    "MAX_RING_SIZE = 9\n",
+    "fscore = npscorer.readNPModel()\n",
+    "xyz_dir_path = '/Users/yairschiff/Downloads/dsgdb9nsd.xyz'\n",
+    "parsed_xyz = []\n",
+    "for file in tqdm(sorted(os.listdir(xyz_dir_path)), desc='Parsing'):\n",
+    "    parsed = parse_xyz(os.path.join(xyz_dir_path, file),\n",
+    "                       max_ring_size=MAX_RING_SIZE,\n",
+    "                       npscorer_model=fscore,\n",
+    "                       array_format='np')\n",
+    "    parsed_xyz.append(parsed)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "12969dd2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "qm9_df = pd.DataFrame(data=parsed_xyz)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eed4f163",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Conversion below is needed to avoid:\n",
+    "#   `ArrowInvalid: ('Can only convert 1-dimensional array values',\n",
+    "#   'Conversion failed for column pos with type object')`\n",
+    "qm9_df['pos'] = qm9_df['pos'].apply(lambda x: [xi for xi in x])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c912d23a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = datasets.Dataset.from_pandas(qm9_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7a7df506",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset.push_to_hub('yairschiff/qm9')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "86c4e1ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # Random train/test splits as recommended by:\n",
+    "# #   https://moleculenet.org/datasets-1\n",
+    "# test_size = 0.1\n",
+    "# seed = 1\n",
+    "# dataset.train_test_split(test_size=test_size, seed=seed)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e982da1b-05ab-493b-bb82-8bf1225dcb2b",
+   "metadata": {},
+   "source": [
+    "## Create tokenizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "b0504e77",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def smi_tokenizer(smi):\n",
+    "    \"\"\"Tokenize a SMILES molecule or reaction.\n",
+    "\n",
+    "        Copied from https://github.com/pschwllr/MolecularTransformer.\n",
+    "    \"\"\"\n",
+    "    import re\n",
+    "    pattern =  \"(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\\\\\|\\/|:|~|@|\\?|>|\\*|\\$|\\%[0-9]{2}|[0-9])\"\n",
+    "    regex = re.compile(pattern)\n",
+    "    tokens = [token for token in regex.findall(smi)]\n",
+    "    assert smi == ''.join(tokens)\n",
+    "    return tokens"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "b89a4def-ea08-466a-8779-24acf75a2bd0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = datasets.load_dataset('yairschiff/qm9', split='train')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "6ef61481-9384-4c1c-8361-ab858cb157ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # If vocab file not created yet, uncomment and run this cell\n",
+    "\n",
+    "# tokens = []\n",
+    "# for smi in dataset['canonical_smiles']:\n",
+    "#     tokens.extend(smi_tokenizer(smi))\n",
+    "\n",
+    "# with open('qm9_vocab.json', 'w', encoding='utf-8') as f:\n",
+    "#     f.write(\n",
+    "#         json.dumps(\n",
+    "#             {t: i for i, t in enumerate(sorted(set(tokens)))},\n",
+    "#             indent=2,\n",
+    "#             sort_keys=True,\n",
+    "#             ensure_ascii=False\n",
+    "#         ) + '\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "6af7fccb-08ee-4dc6-99dc-cfa4fc38074c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # If HF tokenizer not yet published, uncomment and run this cell\n",
+    "# import tokenizer\n",
+    "\n",
+    "# tokenizer.QM9Tokenizer.register_for_auto_class()\n",
+    "# qm9_tokenizer = tokenizer.QM9Tokenizer(vocab_file='qm9_vocab.json')\n",
+    "# qm9_tokenizer.push_to_hub('yairschiff/qm9-tokenizer')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "4cc39f16-b53c-481a-a35e-a42fb1b08378",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Test tokenizer\n",
+    "qm9_tokenizer = transformers.AutoTokenizer.from_pretrained(\n",
+    "    'yairschiff/qm9-tokenizer', trust_remote_code=True, resume_download=None)\n",
+    "print(dataset[1000]['canonical_smiles'])\n",
+    "print(qm9_tokenizer.encode(dataset[1000]['canonical_smiles']))\n",
+    "print(qm9_tokenizer.decode(qm9_tokenizer.encode(dataset[1000]['canonical_smiles'])))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "41752e94-175e-4f40-b9d2-496241eab0c0",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

notebooks/qm9_vocab.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "#": 0,
+  "(": 1,
+  ")": 2,
+  "-": 3,
+  "1": 4,
+  "2": 5,
+  "3": 6,
+  "4": 7,
+  "5": 8,
+  "=": 9,
+  "C": 10,
+  "F": 11,
+  "N": 12,
+  "O": 13,
+  "[C-]": 14,
+  "[CH-]": 15,
+  "[N+]": 16,
+  "[N-]": 17,
+  "[NH+]": 18,
+  "[NH2+]": 19,
+  "[NH3+]": 20,
+  "[O-]": 21,
+  "[c-]": 22,
+  "[cH-]": 23,
+  "[n-]": 24,
+  "[nH+]": 25,
+  "[nH]": 26,
+  "c": 27,
+  "n": 28,
+  "o": 29
+}

notebooks/zinc250k_data_prep.ipynb ADDED Viewed

	@@ -0,0 +1,411 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "fa328603-9e2b-4643-8500-ec11c51b5223",
+   "metadata": {},
+   "source": [
+    "## Imports and setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "7716fb32-a805-4888-9dac-da4cff4f6e40",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import huggingface_hub"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "432e1636",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
+      "Token is valid (permission: write).\n",
+      "Your token has been saved to /share/kuleshov/yzs2/discrete-guidance/.hf_cache/token\n",
+      "Login successful\n"
+     ]
+    }
+   ],
+   "source": [
+    "if os.path.exists(os.path.join(os.environ['HF_HOME'], 'token')):\n",
+    "    with open(os.path.join(os.environ['HF_HOME'], 'token'), 'r') as f:\n",
+    "        token = f.read().strip()\n",
+    "else:\n",
+    "    token = None\n",
+    "huggingface_hub.login(token=token)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "e22e86ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import re\n",
+    "import typing\n",
+    "\n",
+    "import datasets\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import rdkit\n",
+    "import transformers\n",
+    "from rdkit import Chem as rdChem\n",
+    "from tqdm.auto import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "aaa00828",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: Update to 2024.03.6 release when available instead of suppressing warning!\n",
+    "#  See: https://github.com/rdkit/rdkit/issues/7625#\n",
+    "rdkit.rdBase.DisableLog('rdApp.warning')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0a878a71-d33f-43fe-955d-4250950b1eec",
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true
+   },
+   "source": [
+    "## Create dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26856fe2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def count_rings_and_bonds(\n",
+    "    mol: rdChem.Mol\n",
+    ") -> typing.Dict[str, int]:\n",
+    "    \"\"\"Counts rings (total and per ring size) and bonds (by type).\"\"\"\n",
+    "    \n",
+    "    # Counting rings\n",
+    "    ssr = rdChem.GetSymmSSSR(mol)\n",
+    "    ring_count = len(ssr)\n",
+    "    \n",
+    "    ring_sizes = {}\n",
+    "    for ring in ssr:\n",
+    "        ring_size = len(ring)\n",
+    "        if ring_size not in ring_sizes:\n",
+    "            ring_sizes[ring_size] = 0\n",
+    "        ring_sizes[ring_size] += 1\n",
+    "    \n",
+    "    # Counting bond types\n",
+    "    bond_counts = {\n",
+    "        'single': 0,\n",
+    "        'double': 0,\n",
+    "        'triple': 0,\n",
+    "        'aromatic': 0\n",
+    "    }\n",
+    "    \n",
+    "    for bond in mol.GetBonds():\n",
+    "        if bond.GetIsAromatic():\n",
+    "            bond_counts['aromatic'] += 1\n",
+    "        elif bond.GetBondType() == rdChem.BondType.SINGLE:\n",
+    "            bond_counts['single'] += 1\n",
+    "        elif bond.GetBondType() == rdChem.BondType.DOUBLE:\n",
+    "            bond_counts['double'] += 1\n",
+    "        elif bond.GetBondType() == rdChem.BondType.TRIPLE:\n",
+    "            bond_counts['triple'] += 1\n",
+    "    result = {\n",
+    "        'ring_count': ring_count,\n",
+    "    }\n",
+    "    for k, v in ring_sizes.items():\n",
+    "        result[f\"R{k}\"] = v\n",
+    "\n",
+    "    for k, v in bond_counts.items():\n",
+    "        result[f\"{k}_bond\"] = v\n",
+    "    return result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fbde53f7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "    Download data and validation indices from:\n",
+    "        \"Score-based Generative Modeling of Graphs via the System of Stochastic Differential Equations\"\n",
+    "        https://github.com/harryjo97/GDSS\n",
+    "    > wget https://raw.githubusercontent.com/harryjo97/GDSS/master/data/zinc250k.csv\n",
+    "    > wget https://raw.githubusercontent.com/harryjo97/GDSS/master/data/valid_idx_zinc250k.json\n",
+    "\"\"\"\n",
+    "df = pd.read_csv('/Users/yairschiff/Downloads/zinc250k.csv', index_col=0, encoding='utf_8')\n",
+    "feats = []\n",
+    "for i, row in tqdm(df.iterrows(), total=len(df), desc='RDKit feats', leave=False):\n",
+    "    feat = {'smiles': row['smiles']}\n",
+    "    feat['canonical_smiles'] = rdChem.CanonSmiles(feat['smiles'])\n",
+    "    m = rdChem.MolFromSmiles(feat['canonical_smiles'])\n",
+    "    feat.update(count_rings_and_bonds(m))\n",
+    "    feats.append(feat)\n",
+    "df = pd.merge(df, pd.DataFrame.from_records(feats), on='smiles')\n",
+    "df = df.fillna(0)\n",
+    "for col in df.columns:  # recast ring counts as int\n",
+    "    if re.search(\"^R[0-9]+$\", col) is not None:\n",
+    "        df[col] = df[col].astype(int)\n",
+    "# Re-order columns\n",
+    "df = df[\n",
+    "    ['smiles', 'logP', 'qed', 'SAS', 'canonical_smiles',\n",
+    "     'single_bond', 'double_bond', 'triple_bond', 'aromatic_bond',\n",
+    "     'ring_count','R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R12', 'R13', 'R14', 'R15', 'R18', 'R24']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e2d5955",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read in validation indices\n",
+    "with open('/Users/yairschiff/Downloads/valid_idx_zinc250k.json', 'r') as f:\n",
+    "    valid_idxs = json.load(f)\n",
+    "df['validation'] = df.index.isin(valid_idxs).astype(int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2b89b732",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create HF dataset\n",
+    "dataset = datasets.DatasetDict({\n",
+    "    'train': datasets.Dataset.from_pandas(df[df['validation'] == 0].drop(columns=['validation'])),\n",
+    "    'validation': datasets.Dataset.from_pandas(df[df['validation'] == 1].drop(columns=['validation'])),\n",
+    "})\n",
+    "dataset = dataset.remove_columns('__index_level_0__')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1efb5845",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset.push_to_hub('yairschiff/zinc250k')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5c6f357d-20d9-4004-8091-68726b6b4c86",
+   "metadata": {},
+   "source": [
+    "## Create tokenizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "6642fc9d-4863-4e14-947b-95bae48e192d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def smi_tokenizer(smi):\n",
+    "    \"\"\"Tokenize a SMILES molecule or reaction.\n",
+    "\n",
+    "        Copied from https://github.com/pschwllr/MolecularTransformer.\n",
+    "    \"\"\"\n",
+    "    import re\n",
+    "    pattern =  \"(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\\\\\|\\/|:|~|@|\\?|>|\\*|\\$|\\%[0-9]{2}|[0-9])\"\n",
+    "    regex = re.compile(pattern)\n",
+    "    tokens = [token for token in regex.findall(smi)]\n",
+    "    assert smi == ''.join(tokens)\n",
+    "    return tokens"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "3a9e2e60-8596-4a91-acc3-d43e166ce723",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = datasets.load_dataset('yairschiff/zinc250k')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "fbd5c2fe-4318-46bb-bc43-6ef7fe76e9fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # If vocab file not created yet, uncomment and run this cell\n",
+    "\n",
+    "# tokens = []\n",
+    "# for split in dataset.keys():\n",
+    "#     for smi in dataset[split]['canonical_smiles']:\n",
+    "#         tokens.extend(smi_tokenizer(smi))\n",
+    "\n",
+    "# with open('zinc250k_vocab.json', 'w', encoding='utf-8') as f:\n",
+    "#     f.write(\n",
+    "#         json.dumps(\n",
+    "#             {t: i for i, t in enumerate(sorted(set(tokens)))},\n",
+    "#             indent=2,\n",
+    "#             sort_keys=True,\n",
+    "#             ensure_ascii=False\n",
+    "#         ) + '\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "4962478b-5343-4838-befe-64a5389625d4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/yairschiff/zinc250k-tokenizer/commit/7a07b0165a8a4f14f09d6137da8cdabf789397fd', commit_message='Upload tokenizer', commit_description='', oid='7a07b0165a8a4f14f09d6137da8cdabf789397fd', pr_url=None, pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# # If HF tokenizer not yet published, uncomment and run this cell\n",
+    "# import tokenizer\n",
+    "\n",
+    "# tokenizer.Zinc250kTokenizer.register_for_auto_class()\n",
+    "# zinc250k_tokenizer = tokenizer.Zinc250kTokenizer(vocab_file='zinc250k_vocab.json')\n",
+    "# zinc250k_tokenizer.push_to_hub('yairschiff/zinc250k-tokenizer')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "a779aa57-0c9d-4b8c-bf11-ccc5ab4c462e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Cn1ncc2c1CCC[C@H]2NC(=O)NC[C@H](O)COc1ccc(F)cc1\n",
+      "[0, 25, 69, 15, 69, 68, 68, 16, 68, 15, 25, 25, 25, 35, 16, 29, 25, 11, 23, 30, 12, 29, 25, 35, 11, 30, 12, 25, 30, 68, 15, 68, 68, 68, 11, 27, 12, 68, 68, 15, 1]\n",
+      "<bos>Cn1ncc2c1CCC[C@H]2NC(=O)NC[C@H](O)COc1ccc(F)cc1<eos>\n",
+      "Cn1ncc2c1CCC[C@H]2NC(=O)NC[C@H](O)COc1ccc(F)cc1\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Sanity-check the tokenizer: encode one training example and decode it back\n",
+    "zinc250k_tokenizer = transformers.AutoTokenizer.from_pretrained(\n",
+    "    'yairschiff/zinc250k-tokenizer', trust_remote_code=True, resume_download=None)\n",
+    "sample_smiles = dataset['train'][1000]['canonical_smiles']\n",
+    "sample_ids = zinc250k_tokenizer.encode(sample_smiles)\n",
+    "print(sample_smiles)\n",
+    "print(sample_ids)\n",
+    "# Default encoding wraps the sequence in special tokens, visible after decoding\n",
+    "print(zinc250k_tokenizer.decode(sample_ids))\n",
+    "# Without special tokens the decoded text should match the input string\n",
+    "print(zinc250k_tokenizer.decode(\n",
+    "    zinc250k_tokenizer.encode(sample_smiles, add_special_tokens=False)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "f3a15585-8e75-409d-9afe-0e7fe4a0bffc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/224568 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/24887 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(array([  152,  3351, 21311, 47185, 67972, 70367, 25030, 11778,  2179,\n",
+      "         130]), array([10. , 16.4, 22.8, 29.2, 35.6, 42. , 48.4, 54.8, 61.2, 67.6, 74. ]))\n",
+      "10\n",
+      "74\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Encoded length (default encode settings) of every train and validation example\n",
+    "lengths = [\n",
+    "    len(zinc250k_tokenizer.encode(example['canonical_smiles']))\n",
+    "    for split_name in ('train', 'validation')\n",
+    "    for example in tqdm(dataset[split_name], leave=False)\n",
+    "]\n",
+    "print(np.histogram(lengths))\n",
+    "shortest, longest = min(lengths), max(lengths)\n",
+    "print(shortest)\n",
+    "print(longest)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d7a6e081-4961-4cf4-a19d-0375bedd7dab",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

notebooks/zinc250k_vocab.json ADDED Viewed

	@@ -0,0 +1,64 @@

+{
+  "#": 0,
+  "(": 1,
+  ")": 2,
+  "-": 3,
+  "/": 4,
+  "1": 5,
+  "2": 6,
+  "3": 7,
+  "4": 8,
+  "5": 9,
+  "6": 10,
+  "7": 11,
+  "8": 12,
+  "=": 13,
+  "Br": 14,
+  "C": 15,
+  "Cl": 16,
+  "F": 17,
+  "I": 18,
+  "N": 19,
+  "O": 20,
+  "P": 21,
+  "S": 22,
+  "[C@@H]": 23,
+  "[C@@]": 24,
+  "[C@H]": 25,
+  "[C@]": 26,
+  "[CH-]": 27,
+  "[CH2-]": 28,
+  "[N+]": 29,
+  "[N-]": 30,
+  "[NH+]": 31,
+  "[NH-]": 32,
+  "[NH2+]": 33,
+  "[NH3+]": 34,
+  "[O+]": 35,
+  "[O-]": 36,
+  "[OH+]": 37,
+  "[P+]": 38,
+  "[P@@H]": 39,
+  "[P@@]": 40,
+  "[P@]": 41,
+  "[PH+]": 42,
+  "[PH2]": 43,
+  "[PH]": 44,
+  "[S+]": 45,
+  "[S-]": 46,
+  "[S@@+]": 47,
+  "[S@@]": 48,
+  "[S@]": 49,
+  "[SH+]": 50,
+  "[n+]": 51,
+  "[n-]": 52,
+  "[nH+]": 53,
+  "[nH]": 54,
+  "[o+]": 55,
+  "[s+]": 56,
+  "\\": 57,
+  "c": 58,
+  "n": 59,
+  "o": 60,
+  "s": 61
+}