jasminsongsimin committed on
Commit d8e466f · verified · 1 Parent(s): f0754d3

Upload 3 files

Files changed (3)
  1. handler.py +19 -42
  2. modeling_minicpm.py +3 -3
  3. requirements.txt +5 -0
handler.py CHANGED
@@ -1,68 +1,45 @@
-import os
+from typing import Dict, List, Any
+import json
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
 
-# Define the model handler class
-class ModelHandler(object):
-    def __init__(self):
+class EndpointHandler():
+    def __init__(self, path=""):
         self.tokenizer = None
         self.model = None
         self.device = None
+        self.load_model(path)
 
     def load_model(self, model_dir):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
         model_path = model_dir
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
-        self.model = AutoModelForCausalLM.from_pretrained(model_path).to(self.device)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(self.device)
         self.model.eval()
-
         print(f"Tokenizer and Model loaded from: {model_path} to device: {self.device}")
 
 
-    def preprocess(self, request):
-        input_text = request.get("inputs", request.get("text"))
-        if not input_text:
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        inputs = data.pop("inputs", data)
+        print(f'get input {inputs}')
+        if not inputs:
             raise ValueError("Input text is missing in the request. Please provide 'inputs' or 'text' in your request.")
 
-        history = []
-        history.append({"role": "user", "content": input_text})
+        history = json.loads(inputs)
+        print(f'history is {history}')
+        #history.append({"role": "user", "content": inputs})
         conversion = self.tokenizer.apply_chat_template(history, add_generation_prompt=True, tokenize=False)
         encoding = self.tokenizer(conversion, return_tensors="pt").to(self.device)
-        return encoding
-
-
-    def predict(self, model_input):
+        print(f'encoding success')
         with torch.no_grad():
             output = self.model.generate(
-                **model_input,
+                **encoding,
                 max_new_tokens=1024,
                 temperature=1.5,
                 do_sample=True,
                 pad_token_id=self.tokenizer.eos_token_id
             )
-        return output
-
-
-    def postprocess(self, prediction):
-        generated_text = self.tokenizer.decode(prediction[0], skip_special_tokens=True)
-        return {"response": generated_text}
-
-
-_service = ModelHandler()
-
-def load():
-    model_dir = '/home/aistudio/export'
-    _service.load_model(model_dir)
-
-
-def preprocess(request):
-    return _service.preprocess(request)
-
-
-def predict(data):
-    return _service.predict(data)
-
+        print(f'output success')
+        generated_text = self.tokenizer.decode(output[0], skip_special_tokens=True)
 
-def postprocess(prediction):
-    return _service.postprocess(prediction)
+        return [{"response": generated_text}]
 
modeling_minicpm.py CHANGED
@@ -38,7 +38,7 @@ from transformers.modeling_attn_mask_utils import (
 )
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
-from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
+from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
 from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
@@ -61,8 +61,8 @@ except:
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
 if is_torch_fx_available():
-    if not is_torch_greater_or_equal_than_1_13:
-        import torch.fx
+    # if not is_torch_greater_or_equal_than_1_13:
+    #     import torch.fx
 
     _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
 
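For context on this change: is_torch_greater_or_equal_than_1_13 is no longer exported by transformers.pytorch_utils in recent releases, which is presumably why the import is dropped and the pre-1.13 torch.fx fallback is commented out rather than kept. If that fallback were ever needed again, one alternative (a sketch, not what this commit does) would be to compare versions directly using the packaging package, which transformers already depends on:

# Sketch: replace the removed transformers helper with a direct version check.
import torch
from packaging import version  # ships as a transformers dependency

if version.parse(torch.__version__) < version.parse("1.13"):
    import torch.fx  # explicit import needed on older PyTorch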
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
+# for MiniCPM-2B hf inference
+torch>=2.0.0
+transformers>=4.36.2
+
+