import random

import jsonlines
import spacy
from spacy.training import Example


# Train a multilabel text classifier on regulatory-text categories and save
# the resulting pipeline to disk.
#
# Input : a JSON Lines file whose records are read via obj["text"] and
#         obj["label"] (a single label name per record).
# Output: a spaCy pipeline directory written with nlp.to_disk().

# Single source of truth for the category names: used both to register the
# labels on the component and to build each example's "cats" dict, so the two
# can never drift apart.
LABELS = (
    "CapitalRequirements",
    "ConsumerProtection",
    "RiskManagement",
    "ReportingAndCompliance",
    "CorporateGovernance",
)

# Seed the random number generators exactly once, before the pipeline is
# created, so that weight initialisation as well as shuffling are
# reproducible. Re-seeding inside the epoch loop would restart the same
# random stream on every epoch.
spacy.util.fix_random_seed(1)

# Start from a blank English pipeline (tokenizer only).
nlp = spacy.blank("en")

# Add the multilabel text categorizer and register every category.
textcat = nlp.add_pipe('textcat_multilabel', last=True)
for label_name in LABELS:
    textcat.add_label(label_name)

# Location of the preprocessed training records.
processed_data_file = "data/firstStep_file.jsonl"

# Read all records into memory.
with jsonlines.open(processed_data_file) as reader:
    processed_data = list(reader)

# Convert each record into a spaCy Example with a one-hot style "cats" dict.
# NOTE: a record whose label is not in LABELS yields an all-False example
# (i.e. "none of the categories") rather than an error.
spacy_train_data = []
for obj in processed_data:
    text = obj["text"]
    label = {name: obj["label"] == name for name in LABELS}
    spacy_train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": label}))

# Initialise model weights and obtain the optimizer used for updates.
optimizer = nlp.initialize()

# Training loop: reshuffle the data each epoch and update in minibatches of 8.
n_iter = 10
for i in range(n_iter):
    random.shuffle(spacy_train_data)
    losses = {}  # accumulated per-component loss for this epoch
    for batch in spacy.util.minibatch(spacy_train_data, size=8):
        nlp.update(batch, losses=losses, sgd=optimizer)
    print("Iteration:", i, "Losses:", losses)

# Persist the trained pipeline.
output_dir = "./my_trained_model"
nlp.to_disk(output_dir)