| --- |
| license: apache-2.0 |
| datasets: |
| - agentlans/high-quality-english-sentences |
| language: |
| - en |
| base_model: |
| - google-t5/t5-base |
| pipeline_tag: text2text-generation |
| library_name: transformers |
| --- |
| |
This model corrects typos in English text and outputs the corrected text.
|
|
| Example: |
|
|
| Text with Typos: **Whathvhr wh call owr carhaivhrs - doctors, nwrsh practitionhrs, clinicians, - wh nhhd thhm not only to carh, wh nhhd thhm to uh aulh to providh thh riaht valwh.** |
|
|
| Corrected Text: **Whatever we call our caregivers - doctors, nurse practitioners, clinicians, - we need them not only to care, we need them to be able to provide the right value.** |
|
|
|
|
| Example Usage: |
```py
# Load the model and tokenizer first (see the full pipeline below).
text = ""  # Text with typos here!
# Fix: the snippet previously tokenized an undefined `cipher_text` variable.
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
outputs = model.generate(inputs["input_ids"], max_length=256)
corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
```
|
|
|
|
| Full Pipeline Usage: |
| ```py |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
| import torch |
| from string import ascii_lowercase |
| import Levenshtein |
| import random |
| |
# Run on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Shared tokenizer plus the two seq2seq models used by the pipeline below:
# one predicts the substitution alphabet, the other corrects the decoded text.
tokenizer = AutoTokenizer.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng")
alphabet_model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng").to(device)
correction_model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/AutoCorrect-EN-v2").to(device)
| |
def similarity_percentage(s1, s2):
    """Return how similar *s1* and *s2* are, as a percentage.

    100.0 means identical, 0.0 means completely different, based on the
    Levenshtein edit distance normalised by the longer string's length.

    Two empty strings are identical, so that case returns 100.0 directly
    (previously it raised ZeroDivisionError).
    """
    max_len = max(len(s1), len(s2))
    if max_len == 0:
        return 100.0

    distance = Levenshtein.distance(s1, s2)
    return (1 - distance / max_len) * 100
| |
def decode(cipher_text, key):
    """Decode *cipher_text* using *key*, a sequence whose first 26 entries
    give the substitute for each lowercase letter a-z.

    Case is preserved; characters outside the alphabet pass through as-is.
    """
    table = dict(zip(ascii_lowercase, key[:26]))
    table.update({plain.upper(): sub.upper() for plain, sub in zip(ascii_lowercase, key[:26])})
    return ''.join(table.get(ch, ch) for ch in cipher_text)
| |
def model_pass(model, input, max_length=256):
    """Tokenize *input*, generate with *model*, and decode the first result.

    *max_length* caps the generated sequence only; tokenization always
    truncates the input at 256 tokens.  Uses the module-level tokenizer
    and device.
    """
    encoded = tokenizer(input, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
    generated = model.generate(encoded["input_ids"], max_length=max_length)
    return tokenizer.decode(generated[0], skip_special_tokens=True)
| |
def decipher(cipher_text, key) -> str:
    """Decipher the first element of *cipher_text* using the first element
    of *key* as the substitution alphabet.

    Both arguments are indexed with ``[0]`` — presumably batch-shaped
    inputs (list of strings); verify against callers.  Case is preserved
    and non-alphabet characters pass through unchanged.
    """
    table = {}
    for idx, sub in enumerate(key[0]):
        plain = ascii_lowercase[idx]
        table[plain] = sub
        table[plain.upper()] = sub.upper()

    return ''.join(table.get(ch, ch) for ch in cipher_text[0])
| |
def cipher(plain_text) -> tuple[str, list]:
    """Encrypt *plain_text* with a fresh random monoalphabetic substitution.

    Returns the ciphertext together with the substitution map used
    (plain letter -> cipher letter, both cases included).  Characters
    outside the alphabet are left untouched.
    """
    shuffled = list(ascii_lowercase)
    random.shuffle(shuffled)
    key_map = dict(zip(ascii_lowercase, shuffled))

    # Mirror the lowercase mapping onto uppercase letters.
    key_map.update({plain.upper(): sub.upper() for plain, sub in key_map.items()})

    encrypted = ''.join(key_map.get(ch, ch) for ch in plain_text)
    return encrypted, key_map
| |
| def correct_text(cipher_text, model_output): |
| cipher_text = cipher_text.split(' ') |
| model_output = model_output.split(' ') |
| |
| letter_map = {i: {j: 0 for j in ascii_lowercase} for i in ascii_lowercase} |
| |
| |
| # Levenstein distance for lenghts of words |
| n = len(cipher_text) |
| m = len(model_output) |
| |
| i = 0 |
| j = 0 |
| dp = [[0 for _ in range(m + 1)] for _ in range(n + 1)] |
| |
| for i in range(n + 1): |
| dp[i][0] = i |
| |
| |
| for j in range(m + 1): |
| dp[0][j] = j |
| |
| for i in range(1, n + 1): |
| for j in range(1, m + 1): |
| if len(cipher_text[i - 1]) == len(model_output[j - 1]): |
| dp[i][j] = dp[i - 1][j - 1] |
| |
| else: |
| dp[i][j] = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + 1 |
| |
| i = n |
| j = m |
| while i > 0 and j > 0: |
| |
| before = min([(0, dp[i - 1][j - 1]), (1, dp[i - 1][j]), (2, dp[i][j - 1])], key=lambda x: x[1]) |
| match before[0]: |
| case 0: |
| if dp[i - 1][j - 1] == dp[i][j]: |
| # If the same we add them to letter map |
| cipher = cipher_text[i-1] |
| model_o = model_output[j-1] |
| |
| for c_letter, m_letter in zip(cipher.lower(), model_o.lower()): |
| if c_letter in letter_map and m_letter in letter_map[c_letter]: |
| letter_map[c_letter][m_letter] += 1 |
| |
| i = i - 1 |
| j = j - 1 |
| case 1: |
| i = i - 1 |
| case 2: |
| j = j - 1 |
| |
| for letter in ascii_lowercase: |
| letter_sum = sum(letter_map[letter].values()) |
| if letter_sum == 0: |
| # That letter wasn't in the text |
| letter_map[letter] = None |
| continue |
| |
| # Sorted from most accuring to least |
| letter_map[letter] = [(k, v / letter_sum) for k, v in sorted(letter_map[letter].items(), key=lambda item: item[1], reverse=True)] |
| |
| change_map = { |
| i : None for i in ascii_lowercase |
| } |
| |
| for i in range(len(ascii_lowercase)): |
| for letter in ascii_lowercase: |
| if letter_map[letter] is None: |
| continue # That letter wasn't in the text |
| |
| # If None then it didn't get substituted earlier |
| map_letter = letter_map[letter][i][0] |
| if (letter_map[letter][i][1] > 0 and (change_map[map_letter] is None |
| or (change_map[map_letter][2] < letter_map[letter][i][1] and change_map[map_letter][1] >= i))): |
| change_map[map_letter] = (letter, i, letter_map[letter][i][1]) |
| # Letter, iteration, percentage |
| |
| change_map = {i[1][0]: i[0] for i in change_map.items() if i[1] is not None} |
| |
| for letter in ascii_lowercase: |
| if letter not in change_map: |
| change_map[letter] = '.' |
| |
| |
| # Add uppercases |
| change_map.update( |
| { |
| i[0].upper() : i[1].upper() for i in change_map.items() |
| } |
| ) |
| |
| new_text = [] |
| for cipher in cipher_text: |
| new_word = "" |
| for c_letter in cipher: |
| if c_letter in change_map: |
| new_word += change_map[c_letter] |
| |
| else: |
| new_word += c_letter |
| |
| |
| new_text.append(new_word) |
| |
| return ' '.join(new_text) |
| |
def crack_sub(cipher_text):
    """Crack a monoalphabetic substitution cipher end to end.

    Pipeline: predict the 26-letter key with the alphabet model, decode the
    ciphertext with it, run the correction model, re-map letters from that
    correction, then run a final correction pass.
    """
    predicted_key = model_pass(alphabet_model, cipher_text, 26)
    first_decode = decode(cipher_text, predicted_key)
    corrected = model_pass(correction_model, first_decode, len(first_decode))
    remapped = correct_text(cipher_text, corrected)
    final_text = model_pass(correction_model, remapped, len(first_decode))

    return final_text
| |
| """ |
| Use crack_sub() function to solve monoalphabetic substitution ciphers! |
| """ |
| ``` |