Wauplin HF Staff committed on
Commit
e39cda7
·
verified ·
1 Parent(s): cf05e76

pin transformers to PR #45547 merge commit (auto-detect disable_mmap on hf-mount)

Browse files
Files changed (1) hide show
  1. app.py +14 -0
app.py CHANGED
@@ -1,8 +1,13 @@
1
  import os
 
2
  import time
3
  import traceback
4
 
 
 
 
5
  T_START = time.time()
 
6
 
7
  LOAD_STRATEGY = os.environ.get("LOAD_STRATEGY", "normal")
8
  MODEL_ID = "google/gemma-4-E2B-it"
@@ -19,9 +24,12 @@ STATS = {
19
 
20
 
21
  def _profile():
 
22
  import torch
 
23
  import transformers
24
  from transformers import AutoModelForImageTextToText, AutoProcessor
 
25
 
26
  STATS["transformers_version"] = transformers.__version__
27
  STATS["torch_version"] = torch.__version__
@@ -33,11 +41,14 @@ def _profile():
33
  t_imports_done = time.time()
34
  STATS["imports_seconds"] = t_imports_done - T_START
35
 
 
36
  t0 = time.time()
37
  processor = AutoProcessor.from_pretrained(MODEL_SOURCE)
38
  t1 = time.time()
 
39
  STATS["processor_load_seconds"] = t1 - t0
40
 
 
41
  t2 = time.time()
42
  model = AutoModelForImageTextToText.from_pretrained(
43
  MODEL_SOURCE,
@@ -45,6 +56,7 @@ def _profile():
45
  device_map="auto",
46
  )
47
  t3 = time.time()
 
48
  STATS["model_load_seconds"] = t3 - t2
49
  STATS["total_load_seconds"] = t3 - t0
50
 
@@ -57,10 +69,12 @@ def _profile():
57
  return_tensors="pt",
58
  ).to(model.device)
59
 
 
60
  t4 = time.time()
61
  with torch.inference_mode():
62
  out = model.generate(**inputs, max_new_tokens=64, do_sample=False)
63
  t5 = time.time()
 
64
  STATS["predict_seconds"] = t5 - t4
65
 
66
  new_tokens = out[0][inputs["input_ids"].shape[1]:]
 
1
  import os
2
+ import sys
3
  import time
4
  import traceback
5
 
6
+ sys.stdout.reconfigure(line_buffering=True)
7
+ sys.stderr.reconfigure(line_buffering=True)
8
+
9
  T_START = time.time()
10
+ print(f"[app] __main__ start t={T_START:.2f}", flush=True)
11
 
12
  LOAD_STRATEGY = os.environ.get("LOAD_STRATEGY", "normal")
13
  MODEL_ID = "google/gemma-4-E2B-it"
 
24
 
25
 
26
  def _profile():
27
+ print("[app] profile: importing torch...", flush=True)
28
  import torch
29
+ print("[app] profile: importing transformers...", flush=True)
30
  import transformers
31
  from transformers import AutoModelForImageTextToText, AutoProcessor
32
+ print(f"[app] profile: torch={torch.__version__}, transformers={transformers.__version__}", flush=True)
33
 
34
  STATS["transformers_version"] = transformers.__version__
35
  STATS["torch_version"] = torch.__version__
 
41
  t_imports_done = time.time()
42
  STATS["imports_seconds"] = t_imports_done - T_START
43
 
44
+ print(f"[app] loading processor from {MODEL_SOURCE!r}", flush=True)
45
  t0 = time.time()
46
  processor = AutoProcessor.from_pretrained(MODEL_SOURCE)
47
  t1 = time.time()
48
+ print(f"[app] processor loaded in {t1-t0:.2f}s", flush=True)
49
  STATS["processor_load_seconds"] = t1 - t0
50
 
51
+ print(f"[app] loading model from {MODEL_SOURCE!r}", flush=True)
52
  t2 = time.time()
53
  model = AutoModelForImageTextToText.from_pretrained(
54
  MODEL_SOURCE,
 
56
  device_map="auto",
57
  )
58
  t3 = time.time()
59
+ print(f"[app] model loaded in {t3-t2:.2f}s", flush=True)
60
  STATS["model_load_seconds"] = t3 - t2
61
  STATS["total_load_seconds"] = t3 - t0
62
 
 
69
  return_tensors="pt",
70
  ).to(model.device)
71
 
72
+ print("[app] generating...", flush=True)
73
  t4 = time.time()
74
  with torch.inference_mode():
75
  out = model.generate(**inputs, max_new_tokens=64, do_sample=False)
76
  t5 = time.time()
77
+ print(f"[app] generate done in {t5-t4:.2f}s", flush=True)
78
  STATS["predict_seconds"] = t5 - t4
79
 
80
  new_tokens = out[0][inputs["input_ids"].shape[1]:]