gowitheflow committed on
Commit
38c1507
·
1 Parent(s): 7178658

Integrate with Sentence Transformers v5.4 (#2)

Browse files

- Integrate with Sentence Transformers v5.4 (b451dab4411e4b97c710e6341468409d4085f868)

1_Pooling/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "embedding_dimension": 3584,
3
+ "pooling_mode": "lasttoken",
4
+ "include_prompt": true
5
+ }
README.md CHANGED
@@ -1,7 +1,12 @@
1
  ---
2
  license: apache-2.0
3
  pipeline_tag: feature-extraction
4
- library_name: transformers
 
 
 
 
 
5
  ---
6
 
7
  # LCO-Embedding: Scaling Language-Centric Omnimodal Representation Learning
@@ -19,6 +24,67 @@ This model implements the framework presented in the paper [Scaling Language-Cen
19
 
20
  Note: We only use the `thinker` component of Qwen2.5 Omni and drop the `talker` component.
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  ```python
23
  from transformers import Qwen2_5OmniThinkerForConditionalGeneration, Qwen2_5OmniProcessor
24
  from qwen_omni_utils import process_mm_info
 
1
  ---
2
  license: apache-2.0
3
  pipeline_tag: feature-extraction
4
+ library_name: sentence-transformers
5
+ tags:
6
+ - transformers
7
+ - sentence-transformers
8
+ - feature-extraction
9
+ - multimodal-embedding
10
  ---
11
 
12
  # LCO-Embedding: Scaling Language-Centric Omnimodal Representation Learning
 
24
 
25
  Note: We only use the `thinker` component of Qwen2.5 Omni and drop the `talker` component.
26
 
27
+ ### Using Sentence Transformers
28
+
29
+ Install Sentence Transformers:
30
+ ```bash
31
+ pip install "sentence_transformers[image]"
32
+ ```
33
+
34
+ ```python
35
+ import torch
36
+ from sentence_transformers import SentenceTransformer
37
+
38
+ model = SentenceTransformer(
39
+ "LCO-Embedding/LCO-Embedding-Omni-7B",
40
+ trust_remote_code=True,
41
+ model_kwargs={"dtype": torch.bfloat16},
42
+ )
43
+
44
+ # The same "Summarize the above <modality> in one word:" instruction used in
45
+ # the paper is baked into the chat template, so encode() takes plain text or
46
+ # multimodal dicts directly.
47
+ texts = [
48
+ "The capital of France is Paris.",
49
+ "Paris is the capital city of France.",
50
+ "The Eiffel Tower is located in Paris.",
51
+ "Berlin is the capital of Germany.",
52
+ ]
53
+ text_embeddings = model.encode(texts)
54
+ print(text_embeddings.shape)
55
+ # (4, 3584)
56
+
57
+ text_similarities = model.similarity(text_embeddings, text_embeddings)
58
+ print(text_similarities)
59
+ # tensor([[1.0000, 0.9453, 0.6885, 0.5223],
60
+ # [0.9453, 1.0000, 0.7283, 0.5434],
61
+ # [0.6885, 0.7283, 1.0000, 0.3772],
62
+ # [0.5223, 0.5434, 0.3772, 1.0000]])
63
+
64
+ # Encoding images (text, audio, and video also work, individually or combined using a dict input):
65
+ image_embeddings = model.encode([
66
+ "path/to/image_1.png",
67
+ "path/to/image_2.png",
68
+ ])
69
+ print(image_embeddings.shape)
70
+ # (2, 3584)
71
+
72
+ # Multimodal inputs can mix modalities via dicts (text + image + audio + video):
73
+ queries = ["A diagram of the Qwen2.5-Omni architecture"]
74
+ documents = [
75
+ {"image": "path/to/qwen_diagram.png"},
76
+ {"text": "Llama 4 architecture overview", "image": "path/to/llama_diagram.png"},
77
+ ]
78
+ query_embeddings = model.encode(queries)
79
+ document_embeddings = model.encode(documents)
80
+
81
+ similarities = model.similarity(query_embeddings, document_embeddings)
82
+ print(similarities.shape)
83
+ # torch.Size([1, 2])
84
+ ```
85
+
86
+ ### Using Transformers
87
+
88
  ```python
89
  from transformers import Qwen2_5OmniThinkerForConditionalGeneration, Qwen2_5OmniProcessor
90
  from qwen_omni_utils import process_mm_info
additional_chat_templates/sentence_transformers.jinja ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- set audio_count = namespace(value=0) -%}
2
+ {%- set image_count = namespace(value=0) -%}
3
+ {%- set video_count = namespace(value=0) -%}
4
+ <|im_start|>system
5
+ You are a helpful assistant.<|im_end|>
6
+ {% for message in messages -%}
7
+ {%- if message['role'] == 'system' -%}
8
+ {#- skip: the fixed system prompt above was already emitted. Any input system
9
+ message is only present to silence Qwen2.5-Omni's default-prompt warning. -#}
10
+ {%- else -%}
11
+ <|im_start|>{{ message['role'] }}
12
+ {% if message['content'] is string -%}
13
+ {{- message['content'] -}}<|im_end|>
14
+ {% else -%}
15
+ {%- set seen = namespace(image=false, audio=false, video=false) -%}
16
+ {%- for content in message['content'] -%}
17
+ {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
18
+ {%- set image_count.value = image_count.value + 1 -%}
19
+ {%- set seen.image = true -%}
20
+ {%- if add_vision_id -%}Picture {{ image_count.value }}: {% endif -%}
21
+ <|vision_bos|><|IMAGE|><|vision_eos|>
22
+ {%- elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content -%}
23
+ {%- set audio_count.value = audio_count.value + 1 -%}
24
+ {%- set seen.audio = true -%}
25
+ {%- if add_audio_id -%}Audio {{ audio_count.value }}: {% endif -%}
26
+ <|audio_bos|><|AUDIO|><|audio_eos|>
27
+ {%- elif content['type'] == 'video' or 'video' in content -%}
28
+ {%- set video_count.value = video_count.value + 1 -%}
29
+ {%- set seen.video = true -%}
30
+ {%- if add_vision_id -%}Video {{ video_count.value }}: {% endif -%}
31
+ <|vision_bos|><|VIDEO|><|vision_eos|>
32
+ {%- elif 'text' in content -%}
33
+ {{- content['text'] -}}
34
+ {%- endif -%}
35
+ {%- endfor -%}
36
+ {%- if seen.image -%}
37
+ {{ '\n' }}Summarize the above image in one word:
38
+ {%- elif seen.video -%}
39
+ {{ '\n' }}Summarize the above video in one word:
40
+ {%- elif seen.audio -%}
41
+ {{ '\n' }}Summarize the above audio in one word:
42
+ {%- else -%}
43
+ {{ '\n' }}Summarize the above text in one word:
44
+ {%- endif -%}
45
+ <|im_end|>
46
+ {% endif -%}
47
+ {%- endif -%}
48
+ {%- endfor -%}
49
+ {%- if add_generation_prompt -%}
50
+ <|im_start|>assistant
51
+ {% endif -%}
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
chat_template.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "chat_template": "{% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
3
- }
 
 
 
 
config.json CHANGED
@@ -3,6 +3,10 @@
3
  "architectures": [
4
  "Qwen2_5OmniThinkerForConditionalGeneration"
5
  ],
 
 
 
 
6
  "audio_config": {
7
  "_attn_implementation_autoset": true,
8
  "activation_dropout": 0.0,
 
3
  "architectures": [
4
  "Qwen2_5OmniThinkerForConditionalGeneration"
5
  ],
6
+ "auto_map": {
7
+ "AutoConfig": "modeling_lco_omni.Qwen2_5OmniThinkerConfig",
8
+ "AutoModel": "modeling_lco_omni.Qwen2_5OmniThinkerForConditionalGeneration"
9
+ },
10
  "audio_config": {
11
  "_attn_implementation_autoset": true,
12
  "activation_dropout": 0.0,
config_sentence_transformers.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "pytorch": "2.10.0+cu128",
4
+ "sentence_transformers": "5.4.0",
5
+ "transformers": "5.5.0.dev0"
6
+ },
7
+ "default_prompt_name": "default",
8
+ "model_type": "SentenceTransformer",
9
+ "prompts": {
10
+ "default": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
11
+ },
12
+ "similarity_fn_name": "cosine"
13
+ }
modeling_lco_omni.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Re-exported so `auto_map` in config.json can resolve the Thinker classes;
2
+ # `qwen2_5_omni_thinker` is shipped by transformers but not in `AutoConfig`.
3
+ from transformers import Qwen2_5OmniThinkerConfig, Qwen2_5OmniThinkerForConditionalGeneration
4
+
5
+ __all__ = [
6
+ "Qwen2_5OmniThinkerConfig",
7
+ "Qwen2_5OmniThinkerForConditionalGeneration",
8
+ ]
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.base.modules.transformer.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.sentence_transformer.modules.pooling.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.sentence_transformer.modules.normalize.Normalize"
19
+ }
20
+ ]
sentence_bert_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "feature-extraction",
3
+ "modality_config": {
4
+ "text": {
5
+ "method": "forward",
6
+ "method_output_name": [
7
+ "hidden_states",
8
+ -1
9
+ ]
10
+ },
11
+ "image": {
12
+ "method": "forward",
13
+ "method_output_name": [
14
+ "hidden_states",
15
+ -1
16
+ ]
17
+ },
18
+ "audio": {
19
+ "method": "forward",
20
+ "method_output_name": [
21
+ "hidden_states",
22
+ -1
23
+ ]
24
+ },
25
+ "video": {
26
+ "method": "forward",
27
+ "method_output_name": [
28
+ "hidden_states",
29
+ -1
30
+ ]
31
+ },
32
+ "message": {
33
+ "method": "forward",
34
+ "method_output_name": [
35
+ "hidden_states",
36
+ -1
37
+ ],
38
+ "format": "structured"
39
+ }
40
+ },
41
+ "module_output_name": "token_embeddings",
42
+ "processing_kwargs": {
43
+ "chat_template": {
44
+ "chat_template": "sentence_transformers",
45
+ "add_generation_prompt": true
46
+ }
47
+ }
48
+ }