import os
from collections import defaultdict

import joblib
import pandas as pd
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
|
def split_train_left_right(data):
    """Split training rows into two disjoint per-word partitions.

    Rows are sorted by (Tag, Affix) and deduplicated on (Word, Tag), so each
    word/tag pair survives at most once.  The first surviving row for a word
    goes to the "right" partition; every later row for the same word (i.e. an
    additional tag) goes to the "left" partition.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain at least the columns 'Word', 'Tag' and 'Affix'.

    Returns
    -------
    tuple of pd.DataFrame
        (right_df, left_df) in that order.
    """
    # `ordered` instead of `sorted` — don't shadow the builtin.
    ordered = data.sort_values(['Tag', 'Affix'])
    ordered = ordered.drop_duplicates(subset=['Word', 'Tag'])

    seen_tags = {}  # word -> set of tags already routed to a partition
    left = []
    right = []

    for _, row in ordered.iterrows():
        word = row['Word']
        tag = row['Tag']

        # First occurrence of a word goes right; later rows with a new tag
        # go left.
        if seen_tags.get(word) and tag not in seen_tags[word]:
            left.append(row)
        else:
            right.append(row)

        seen_tags.setdefault(word, set()).add(tag)

    return pd.DataFrame(right), pd.DataFrame(left)
|
|
# Load the training data and split it into the "right" (first tag per word)
# and "left" (additional tags for already-seen words) partitions.
filepath = "train_fixed.csv"
data = pd.read_csv(filepath)


right_df, left_df = split_train_left_right(data)
| |
| |
|
|
|
|
| |
# Train one RandomForest per (side, PoS category) on char TF-IDF of the
# affix plus hand-crafted count features, then persist model + vectorizer.
os.makedirs("artefacts", exist_ok=True)  # joblib.dump fails if dir is missing

for side, df in [('right', right_df), ('left', left_df)]:
    categories = df["PoS_word"].unique()
    category_res = {}

    for category in categories:
        print(f"Category: {category}")

        # .copy() so feature columns are added to an independent frame
        # (avoids pandas SettingWithCopyWarning on a slice of `df`).
        category_data = df[df["PoS_word"] == category].copy()
        print(category_data.shape)

        # Hand-crafted features; `word_length` is computed but intentionally
        # not fed to the model (kept for inspection via `category_res`).
        category_data['text_length'] = category_data['Affix'].apply(len)
        category_data['word_length'] = category_data['Word'].apply(len)
        category_data['ү_count'] = category_data['Word'].apply(lambda x: x.count('ү'))
        category_data['ө_count'] = category_data['Word'].apply(lambda x: x.count('ө'))

        X = category_data["Affix"]
        y = category_data["Tag"]

        # Character n-grams (1..5) capture sub-affix morphology.
        vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 5))
        X_train_tfidf = vectorizer.fit_transform(X)
        X_train_combined = hstack([X_train_tfidf,
                                   category_data[['text_length', 'ү_count', 'ө_count']]])

        model = RandomForestClassifier(n_estimators=300)
        model.fit(X_train_combined, y)

        # NOTE: predictions are made on the training set itself, so the
        # scores below are training scores, not generalization estimates.
        y_pred = model.predict(X_train_combined)
        category_data['pred'] = y_pred
        category_res[category] = category_data

        print("Accuracy:", accuracy_score(y, y_pred))
        print("F1 Score:", f1_score(y, y_pred, average="weighted"))
        print(model)

        joblib.dump(model, f"artefacts/model_{category}_{side}.joblib")
        joblib.dump(vectorizer, f"artefacts/vectorizer_{category}_{side}.joblib")
|
|
| |
# Load the held-out test data; `data` is re-bound from the training frame.
filepath = "test_fixed.csv"
data = pd.read_csv(filepath)
|
|
|
|
def split_test_left_right(data):
    """Split test rows into "right" (first row per word) and "left"
    (subsequent rows per word) partitions.

    Rows are sorted by 'Affix'; the first time a word is encountered its row
    goes to the "right" partition, and every repeat goes to the "left" one.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain at least the columns 'Word' and 'Affix'.

    Returns
    -------
    tuple of pd.DataFrame
        (right_df, left_df) in that order.
    """
    # `ordered` instead of `sorted` — don't shadow the builtin.
    ordered = data.sort_values(['Affix'])

    # A set of seen words replaces the original defaultdict-of-lists, which
    # was only ever consulted for truthiness.
    seen = set()
    left = []
    right = []

    for _, row in ordered.iterrows():
        word = row['Word']
        if word in seen:
            left.append(row)
        else:
            right.append(row)
        seen.add(word)

    return pd.DataFrame(right), pd.DataFrame(left)
|
|
| right_df, left_df = split_test_left_right(data) |
| |
| |
| |
|
|
| |
# Inference: for each (side, PoS category), load the matching artefacts
# saved during training and predict tags for the test rows.
result_dfs = []
for side, df in [('right', right_df), ('left', left_df)]:
    print(side)
    categories = df["PoS_word"].unique()

    for category in categories:
        print(f"Category: {category}, side: {side}")

        # .copy() so feature columns are added to an independent frame
        # (avoids pandas SettingWithCopyWarning on a slice of `df`).
        category_data = df[df["PoS_word"] == category].copy()
        print(category_data.shape)

        # Same hand-crafted features as at training time; `word_length` is
        # not fed to the model but ends up in the output CSV.
        category_data['text_length'] = category_data['Affix'].apply(len)
        category_data['word_length'] = category_data['Word'].apply(len)
        category_data['ү_count'] = category_data['Word'].apply(lambda x: x.count('ү'))
        category_data['ө_count'] = category_data['Word'].apply(lambda x: x.count('ө'))

        X = category_data["Affix"]

        # Reuse the fitted vectorizer: transform only, never re-fit.
        # NOTE(review): a category present only in the test set will raise
        # FileNotFoundError here — confirm the category sets match.
        vectorizer = joblib.load(f"artefacts/vectorizer_{category}_{side}.joblib")
        X_tfidf = vectorizer.transform(X)

        model = joblib.load(f"artefacts/model_{category}_{side}.joblib")

        X_combined = hstack([X_tfidf,
                             category_data[['text_length', 'ү_count', 'ө_count']]])
        category_data['Tag'] = model.predict(X_combined)
        result_dfs.append(category_data)


pd.concat(result_dfs).to_csv('file_pred_12.csv', index=False)
|
|
| |