| | |
| |
|
| | import torch |
| | import os |
| |
|
| | from modelscope.models.base import TorchModel |
| | from modelscope.preprocessors.base import Preprocessor |
| | from modelscope.pipelines.base import Model, Pipeline |
| | from modelscope.utils.config import Config |
| | from modelscope.pipelines.builder import PIPELINES |
| | from modelscope.preprocessors.builder import PREPROCESSORS |
| | from modelscope.models.builder import MODELS |
| | from modelscope.preprocessors.image import load_image |
| |
|
| |
|
| | from vlmo.utils.beit_utils import load_from_config |
| |
|
| |
|
@PIPELINES.register_module(
    "multi-modal-embeddings", module_name="multi-modal-embedding-pipeline"
)
class MyCustomPipeline(Pipeline):
    """ModelScope pipeline wrapping the M2-Encoder multi-modal model.

    Registered under task ``multi-modal-embeddings`` with pipeline type
    ``multi-modal-embedding-pipeline``.  Depending on the input dict it
    either performs zero-shot image classification (``image`` +
    ``label_list``) or returns raw text/image embeddings (``text`` /
    ``img`` keys) — see :meth:`forward`.

    Examples:

        >>> from modelscope.pipelines import pipeline
        >>> input = "Hello, ModelScope!"
        >>> my_pipeline = pipeline('my-task', 'my-model-id')
        >>> result = my_pipeline(input)

    """

    def __init__(self, model, preprocessor=None, **kwargs):
        """Create the pipeline and load the M2-Encoder checkpoint.

        Args:
            model: local model directory (or model id resolved to one by
                ModelScope); used as the root for tokenizer/checkpoint paths.
            preprocessor: unused here — preprocessing is done by the
                tokenizer/image processor returned by ``load_from_config``.
            **kwargs: accepted for framework compatibility; not used.
        """
        # auto_collate=False: inputs are passed through to forward() as-is.
        super().__init__(model=model, auto_collate=False)
        self.model_dir = model
        self._device = "cuda" if torch.cuda.is_available() else "cpu"
        # Hard-coded configuration for the 1B "large" M2-Encoder variant.
        # NOTE(review): the semantics of most keys (encoder_layers,
        # beit3_vl_layers, visual_mask_size, ...) come from vlmo's
        # load_from_config and cannot be verified from this file.
        model_config = {
            "loss_names": {"itc": 1},  # image-text contrastive loss only
            "beit_version": "large",
            "encoder_embed_dim": 1024,
            "out_embed_dim": 1024,
            "encoder_layers": 21,
            "beit3_vl_layers": 3,
            "visual_mask_size": 14,
            "tokenizer_type": "GLMChineseTokenizer",
            "tokenizer": os.path.join(self.model_dir, "./vlmo/tokenizer"),
            "vocab_size": 115244,
            "whole_word_masking": False,
            "precision": 32,       # fp32 inference
            "test_only": True,     # inference-only; no training heads
            "flash_attn": True,
            "model_path": os.path.join(self.model_dir, "m2_encoder_1B.ckpt"),
            "modelscope": {
                "model_id": "M2Cognition/M2_Encoder_Large"
            },
            "model_file": "m2_encoder_1B.ckpt"
        }
        # load_from_config returns (model, (tokenizer, image_processor)).
        model, processors = load_from_config(model_config)
        self.model = model
        self.model.to(self._device).eval()
        self._tokenizer, self._img_processor = processors

    def _sanitize_parameters(self, **pipeline_parameters):
        """Route all call-time kwargs to forward().

        This deviates from the base-class default (which routes them to
        postprocess): here preprocess and postprocess take no parameters
        and forward() receives the full input dict.

        Returns:
            tuple: (preprocess_params, forward_params, postprocess_params)
            = ({}, pipeline_parameters, {}).
        """
        return {}, pipeline_parameters, {}

    def _check_input(self, inputs):
        # No input validation; forward() branches on the keys present.
        pass

    def _check_output(self, outputs):
        # No output validation.
        pass

    def forward(self, forward_params):
        """Run inference; behavior depends on the keys in ``forward_params``.

        Two modes:
          * Zero-shot classification — when ``label_list`` holds a
            comma-separated string with at least two labels, encode the
            labels and the ``image``, and return the best-matching label
            plus per-label softmax scores:
            ``{"text": label, "scores": [...]}``.
          * Embedding extraction — otherwise, encode ``text`` (str or list
            of str) and/or ``img`` (a PIL-style image accepted by the image
            processor) and return ``{"text_embedding": ..., "img_embedding": ...}``
            with whichever keys were requested (possibly empty).
        """
        labels = forward_params.get("label_list", "")
        labels = labels.split(",")
        # Classification requires >= 2 non-empty labels; note a single-label
        # input falls through to the embedding branch below.
        if len(labels) > 1 and labels[0] != "":
            txt_encoding = self._tokenizer(
                labels,
                padding="max_length",
                truncation=True,
                max_length=self.model.hparams.config["max_text_len"],
                return_special_tokens_mask=True,
            )
            txt_data = {
                "text_ids": torch.tensor(txt_encoding["input_ids"]).to(self._device),
                "text_masks": torch.tensor(txt_encoding["attention_mask"]).to(
                    self._device
                ),
                "text_labels": None,
            }
            # "cls_vlffn_feats" is presumably the joint vision-language CLS
            # embedding — TODO confirm against vlmo's infer_text/infer_image.
            txt_feats = self.model.infer_text(txt_data)["cls_vlffn_feats"]
            image = forward_params["image"]
            image = load_image(image)  # accepts URL or local path
            img = self._img_processor(image).unsqueeze(0)  # add batch dim
            img_data = {"image": [img.to(self._device)]}
            img_feats = self.model.infer_image(img_data)["cls_vlffn_feats"]
            # CLIP-style temperature-scaled image->text similarity.
            logits_per_image = self.model.logit_scale.exp() * img_feats @ txt_feats.t()
            probs = logits_per_image.softmax(dim=-1).detach().cpu()
            # argmax over labels for the single image in the batch.
            index = probs.max(dim=-1)[1][0]
            label = labels[index]
            return {"text": label, "scores": probs.numpy().tolist()[0]}
        else:
            rets = {}
            if "text" in forward_params:
                text = forward_params.get("text")
                txt_encoding = self._tokenizer(
                    text,
                    padding="max_length",
                    truncation=True,
                    max_length=self.model.hparams.config["max_text_len"],
                    return_special_tokens_mask=True,
                )
                txt_data = {
                    "text_ids": torch.tensor(txt_encoding["input_ids"]).to(
                        self._device
                    ),
                    "text_masks": torch.tensor(txt_encoding["attention_mask"]).to(
                        self._device
                    ),
                    "text_labels": None,
                }
                txt_feats = self.model.infer_text(txt_data)["cls_vlffn_feats"]
                rets.update({"text_embedding": txt_feats.detach()})
            if "img" in forward_params:
                # Unlike the classification branch, "img" is expected to be
                # an already-loaded image object, not a URL/path.
                input_img = forward_params["img"]
                img = self._img_processor(input_img).unsqueeze(0)
                img_data = {"image": [img.to(self._device)]}
                img_feats = self.model.infer_image(img_data)["cls_vlffn_feats"]
                rets.update({"img_embedding": img_feats.detach()})

            return rets

    def preprocess(self, inputs):
        # Identity: all real preprocessing happens inside forward().
        return inputs

    def postprocess(self, inputs):
        """If current pipeline support model reuse, common postprocess
        code should be write here.

        Args:
            inputs: input data

        Return:
            dict of results: a dict containing outputs of model, each
            output should have the standard output name.
        """
        return inputs
| |
|
| |
|
# NOTE: reference snippet (kept as an inert module-level string) showing how
# the configuration.json consumed by ModelScope was generated for this model.
"""
# Tips: usr_config_path is the temporary save configuration location, after upload modelscope hub, it is the model_id
usr_config_path = "/tmp/snapdown/"
config = Config(
    {
        "framework": "pytorch",
        "task": "multi-modal-embeddings",
        "model": {"type": "m2-encoder"},
        "pipeline": {"type": "multi-modal-embedding-pipeline"},
        "allow_remote": True,
    }
)
config.dump("/tmp/snapdown/" + "configuration.json")
"""
| |
|
if __name__ == "__main__":
    # Smoke-test the registered pipeline end to end (downloads the model
    # and a sample image; requires network access).
    from modelscope.pipelines import pipeline

    model = "M2Cognition/M2-Encoder"
    pipe = pipeline("multi-modal-embeddings", model=model)

    # 1) Zero-shot classification: image URL + comma-separated labels.
    # (renamed from `input` so the builtin is not shadowed)
    demo_input = {
        "image": "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg",
        "label_list": "杰尼龟,妙蛙种子,小火龙,皮卡丘",
    }
    demo = pipe(demo_input)
    print("demo output", demo)

    # 2) Text embeddings for a batch of strings.
    inputs = {"text": ["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]}
    output = pipe(inputs)
    print("text output", output)

    # 3) Image embedding; load_image comes from the top-of-file import
    # (the previous duplicate local import was removed).
    input_img = load_image(
        "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg"
    )
    inputs = {"img": input_img}
    img_embedding = pipe(inputs)
    print("image output", img_embedding)
| |
|