| import pandas as pd |
|
|
def preprocess_data(data):
    """Drop likely-identifying (high-cardinality) columns from *data*.

    A column is considered sensitive when the ratio of distinct values to
    total rows exceeds 0.5 — e.g. IDs, names, emails, which are unique or
    near-unique per row. Such columns are removed; all others are kept in
    their original order.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. Not modified in place.

    Returns
    -------
    pandas.DataFrame
        A copy of *data* without the high-cardinality columns. An empty
        (zero-row) frame is returned unchanged rather than raising
        ZeroDivisionError.
    """
    n_rows = len(data.index)
    if n_rows == 0:
        # No rows -> uniqueness ratio is undefined; nothing to drop.
        return data

    # dropna=False so NaN counts as a distinct value, matching
    # len(column.unique()) semantics.
    uniqueness = data.nunique(dropna=False) / n_rows
    sensitive_cols = uniqueness[uniqueness > 0.5].index

    return data.drop(columns=sensitive_cols)
| |
|
|
|
|
import pandas as pd
import streamlit as st
import torch
import transformers

from preprocess import preprocess_data
| |
def anonymize_text(text):
    """Return the top-5 candidate replacement tokens for masked text.

    Runs DistilBERT's masked-language-model head over *text* and returns
    the five highest-scoring fill-in tokens (decoded strings) for the
    masked position(s).

    NOTE(review): despite the name, this does not redact *text* itself —
    it only proposes substitutes for existing ``[MASK]`` markers; callers
    must insert the mask tokens themselves. Confirm intended contract.

    Parameters
    ----------
    text : str
        Input string, expected to contain the tokenizer's mask token
        (``[MASK]`` for DistilBERT).

    Returns
    -------
    list[str]
        The five most likely replacement tokens, best first.
    """
    # Bug fix: the original referenced `torch` without importing it,
    # raising NameError on every call.
    model_name = "distilbert-base-uncased"
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    model = transformers.AutoModelForMaskedLM.from_pretrained(model_name)

    input_ids = tokenizer.encode(text, return_tensors="pt")
    # Positions (along the sequence axis) where the mask token appears.
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]

    # Inference only — no gradients needed.
    with torch.no_grad():
        token_logits = model(input_ids)[0]
    mask_token_logits = token_logits[0, mask_token_index, :]

    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

    return [tokenizer.decode([token]) for token in top_5_tokens]
| |
| def run_app(): |
| st.title("Text Anonymization App") |
| |
| # File upload |
| st.subheader("Upload your data") |
| file = st.file_uploader("Upload CSV", type=["csv"]) |
|
|
| if file is not None: |
| # Read the file |
| data = pd.read_csv(file) |
| |
| # Preprocess the data |
| preprocessed_data = preprocess_data(data) |
| |
| # Column selection |
| st.subheader("Select columns to anonymize") |
| selected_columns = [] |
| for col in preprocessed_data.columns: |
| if st.checkbox(col): |
| selected_columns.append(col) |
| |
| # |
| |