reupload from modelscope cn
Browse files- .gitattributes +53 -35
- NOTICE +339 -0
- README.md +735 -0
- chat_template.json +3 -0
- config.json +51 -0
- configuration.json +1 -0
- configuration_dots.py +78 -0
- dots.ocr-1.5 LICENSE AGREEMENT +109 -0
- generation_config.json +8 -0
- merges.txt +3 -0
- model-00001-of-00002.safetensors +3 -0
- model-00002-of-00002.safetensors +3 -0
- model.safetensors.index.json +651 -0
- modeling_dots_ocr.py +131 -0
- modeling_dots_vision.py +404 -0
- preprocessor_config.json +22 -0
- special_tokens_map.json +25 -0
- tokenizer.json +3 -0
- tokenizer_config.json +391 -0
- vocab.json +3 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,53 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.
|
| 5 |
-
*.
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.
|
| 12 |
-
*.
|
| 13 |
-
*.
|
| 14 |
-
*.
|
| 15 |
-
*.
|
| 16 |
-
*.
|
| 17 |
-
*.
|
| 18 |
-
*.
|
| 19 |
-
*.
|
| 20 |
-
*
|
| 21 |
-
*.
|
| 22 |
-
*.
|
| 23 |
-
*.
|
| 24 |
-
*.
|
| 25 |
-
*.
|
| 26 |
-
|
| 27 |
-
*.
|
| 28 |
-
*.
|
| 29 |
-
*.
|
| 30 |
-
*
|
| 31 |
-
*.
|
| 32 |
-
*.
|
| 33 |
-
|
| 34 |
-
*.
|
| 35 |
-
*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.db* filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.ark* filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
|
| 34 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*.gguf* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.ggml filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.llamafile* filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
*.pt2 filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
|
| 49 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
vocab.json filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
model-00001-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
merges.txt filter=lfs diff=lfs merge=lfs -text
|
NOTICE
ADDED
|
@@ -0,0 +1,339 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
==================================================================
|
| 2 |
+
=============== Copyright Notice and License Texts ===============
|
| 3 |
+
==================================================================
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
------------- LICENSE FOR gradio CODE --------------
|
| 7 |
+
|
| 8 |
+
Copyright notice:No copyright info provided
|
| 9 |
+
|
| 10 |
+
License:apache2.0
|
| 11 |
+
|
| 12 |
+
Apache License
|
| 13 |
+
Version 2.0, January 2004
|
| 14 |
+
http://www.apache.org/licenses/
|
| 15 |
+
|
| 16 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 17 |
+
|
| 18 |
+
1. Definitions.
|
| 19 |
+
|
| 20 |
+
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
|
| 21 |
+
|
| 22 |
+
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
|
| 23 |
+
|
| 24 |
+
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition,
|
| 25 |
+
|
| 26 |
+
"control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
|
| 27 |
+
|
| 28 |
+
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
|
| 29 |
+
|
| 30 |
+
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
|
| 31 |
+
|
| 32 |
+
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
|
| 33 |
+
|
| 34 |
+
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
|
| 35 |
+
|
| 36 |
+
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
|
| 37 |
+
|
| 38 |
+
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
|
| 39 |
+
|
| 40 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
|
| 41 |
+
|
| 42 |
+
2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
|
| 43 |
+
|
| 44 |
+
3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their
|
| 45 |
+
Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
|
| 46 |
+
|
| 47 |
+
4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
|
| 48 |
+
|
| 49 |
+
(a) You must give any other recipients of the Work or Derivative Works a copy of this License; and
|
| 50 |
+
|
| 51 |
+
(b) You must cause any modified files to carry prominent notices stating that You changed the files; and
|
| 52 |
+
|
| 53 |
+
(c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
|
| 54 |
+
|
| 55 |
+
(d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
|
| 56 |
+
|
| 57 |
+
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
|
| 58 |
+
|
| 59 |
+
5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions.Not withstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
|
| 60 |
+
|
| 61 |
+
6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
|
| 62 |
+
|
| 63 |
+
7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
|
| 64 |
+
|
| 65 |
+
8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
|
| 66 |
+
|
| 67 |
+
9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
|
| 68 |
+
|
| 69 |
+
END OF TERMS AND CONDITIONS
|
| 70 |
+
|
| 71 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 72 |
+
|
| 73 |
+
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.
|
| 74 |
+
|
| 75 |
+
Copyright [yyyy] [name of copyright owner]
|
| 76 |
+
|
| 77 |
+
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
|
| 78 |
+
|
| 79 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 80 |
+
|
| 81 |
+
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
------------- LICENSE FOR gradio_image_annotation CODE --------------
|
| 86 |
+
|
| 87 |
+
Copyright notice:Copyright (c) 2024 Edgar Gracia
|
| 88 |
+
License :MIT
|
| 89 |
+
MIT License
|
| 90 |
+
|
| 91 |
+
Copyright (c) 2024 Edgar Gracia
|
| 92 |
+
|
| 93 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
| 94 |
+
|
| 95 |
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
| 96 |
+
|
| 97 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
------------- LICENSE FOR openai CODE --------------
|
| 102 |
+
|
| 103 |
+
Copyright notice:Copyright 2025 OpenAI
|
| 104 |
+
|
| 105 |
+
License:apache2.0
|
| 106 |
+
|
| 107 |
+
Please see above.
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
------------- LICENSE FOR qwen_vl_utils CODE --------------
|
| 112 |
+
|
| 113 |
+
Copyright notice:No copyright info provided
|
| 114 |
+
|
| 115 |
+
License:apache2.0
|
| 116 |
+
|
| 117 |
+
Please see above.
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
------------- LICENSE FOR transformers CODE --------------
|
| 122 |
+
|
| 123 |
+
Copyright notice:Copyright 2018- The Hugging Face team. All rights reserved.
|
| 124 |
+
|
| 125 |
+
License:apache2.0
|
| 126 |
+
|
| 127 |
+
Please see above.
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
------------- LICENSE FOR huggingface_hub CODE --------------
|
| 132 |
+
|
| 133 |
+
Copyright notice:No copyright info provided
|
| 134 |
+
|
| 135 |
+
License:apache2.0
|
| 136 |
+
|
| 137 |
+
Please see above.
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
------------- LICENSE FOR flash-attn CODE --------------
|
| 142 |
+
|
| 143 |
+
Copyright notice:Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. All rights reserved.
|
| 144 |
+
|
| 145 |
+
License:BSD-3-Clause license
|
| 146 |
+
|
| 147 |
+
BSD 3-Clause License
|
| 148 |
+
|
| 149 |
+
Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. All rights reserved.
|
| 150 |
+
|
| 151 |
+
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
| 152 |
+
|
| 153 |
+
* Redistributions of source code must retain the above copyright notice, this list ofconditions and the following disclaimer.
|
| 154 |
+
|
| 155 |
+
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
|
| 156 |
+
|
| 157 |
+
* Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
|
| 158 |
+
|
| 159 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
| 160 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISEDOF THE POSSIBILITY OF SUCH DAMAGE.
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
------------- LICENSE FOR accelerate CODE --------------
|
| 165 |
+
|
| 166 |
+
Copyright notice:No copyright info provided
|
| 167 |
+
|
| 168 |
+
License:apache2.0
|
| 169 |
+
|
| 170 |
+
Please see above.
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
------------- LICENSE FOR MonkeyOCR CODE --------------
|
| 175 |
+
|
| 176 |
+
Copyright notice:No copyright info provided
|
| 177 |
+
|
| 178 |
+
License:apache2.0
|
| 179 |
+
|
| 180 |
+
Please see above.
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
------------- LICENSE FOR OmniDocbench CODE --------------
|
| 185 |
+
|
| 186 |
+
Copyright notice:No copyright info provided
|
| 187 |
+
|
| 188 |
+
License:apache2.0
|
| 189 |
+
|
| 190 |
+
Please see above.
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
------------- LICENSE FOR Qwen2.5-VL CODE --------------
|
| 195 |
+
|
| 196 |
+
Copyright notice:No copyright info provided
|
| 197 |
+
|
| 198 |
+
License:apache2.0
|
| 199 |
+
|
| 200 |
+
Please see above.
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
------------- LICENSE FOR aimv2 CODE --------------
|
| 205 |
+
|
| 206 |
+
Copyright notice: Copyright (C) 2024 Apple Inc. All Rights Reserved.
|
| 207 |
+
|
| 208 |
+
License:
|
| 209 |
+
|
| 210 |
+
IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") in consideration of your agreement to the following terms, and your use, installation, modification or redistribution of this Apple software constitutes acceptance of these terms. If you do not agree with these terms, please do not use, install, modify or
|
| 211 |
+
redistribute this Apple software.
|
| 212 |
+
|
| 213 |
+
In consideration of your agreement to abide by the following terms, and subject to these terms, Apple grants you a personal, non-exclusive license, under Apple's copyrights in this original Apple software (the "Apple Software"), to use, reproduce, modify and redistribute the Apple Software, with or without modifications, in source and/or binary forms; provided that if you redistribute the Apple Software in its entirety and without modifications, you must retain this notice and the following text and disclaimers in all such redistributions of the Apple Software. Neither the name, trademarks, service marks or logos of Apple Inc. May be used to endorse or promote products derived from the Apple Software without specific prior written permission from Apple. Except as expressly stated in this notice, no other rights or licenses, express or implied, are granted by Apple herein, including but not limited to any patent rights that may be infringed by your derivative works or by other works in which the Apple Software may be incorporated.
|
| 214 |
+
|
| 215 |
+
The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
|
| 216 |
+
FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS. IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
|
| 217 |
+
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 218 |
+
|
| 219 |
+
SOFTWARE DISTRIBUTED WITH AUTOREGRESSIVE IMAGE MODELS:
|
| 220 |
+
|
| 221 |
+
The Autoregressive Image Models software includes a number of subcomponents with
|
| 222 |
+
separate copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.
|
| 223 |
+
|
| 224 |
+
Acknowledgements:
|
| 225 |
+
|
| 226 |
+
Portions of the Autoregressive Image Models project may utilize the following copyrighted material, the use of which is hereby acknowledged.
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
------------- LICENSE FOR Hugging Face CODE --------------
|
| 230 |
+
|
| 231 |
+
Copyright notice:Copyright 2019 Ross Wightman
|
| 232 |
+
|
| 233 |
+
License:apache2.0
|
| 234 |
+
|
| 235 |
+
Please see above.
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
------------- LICENSE FOR vLLM CODE --------------
|
| 240 |
+
|
| 241 |
+
Copyright notice:No copyright info provided
|
| 242 |
+
|
| 243 |
+
License:apache2.0
|
| 244 |
+
|
| 245 |
+
Please see above.
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
------------- LICENSE FOR Doclaynet --------------
|
| 250 |
+
|
| 251 |
+
Copyright notice:No copyright info provided
|
| 252 |
+
|
| 253 |
+
License:Community Data License Agreement
|
| 254 |
+
|
| 255 |
+
Community Data License Agreement – Permissive – Version 1.0
|
| 256 |
+
|
| 257 |
+
This is the Community Data License Agreement – Permissive, Version 1.0 (“Agreement”). Data is provided to You under this Agreement by each of the Data Providers. Your exercise of any of the rights and permissions granted below constitutes your acceptance and agreement to be bound by the terms and conditions of this Agreement.
|
| 258 |
+
|
| 259 |
+
The benefits that each Data Provider receives from making Data available and that You receive from Data or otherwise under these terms and conditions shall be deemed sufficient consideration for the formation of this Agreement. Accordingly, Data Provider(s) and You (the "Parties") agree as follows:
|
| 260 |
+
|
| 261 |
+
Section 1. Definitions
|
| 262 |
+
|
| 263 |
+
1.1 "Add" means to supplement Data with Your own or someone else's Data, resulting in Your “Additions.” Additions do not include Results.
|
| 264 |
+
|
| 265 |
+
1.2 "Computational Use" means Your analysis (through the use of computational devices or otherwise) or other interpretation of Data. By way of example and not limitation, "Computational Use" includes the application of any computational analytical technique, the purpose of which is the analysis of any Data in digital form to generate information about Data such as patterns, trends, correlations, inferences, insights and attributes.
|
| 266 |
+
|
| 267 |
+
1.3 "Data" means the information (including copyrightable information, such as images or text), collectively or individually, whether created or gathered by a Data Provider or an Entity acting on its behalf, to which rights are granted under this Agreement.
|
| 268 |
+
|
| 269 |
+
1.4 "Data Provider" means any Entity (including any employee or contractor of such Entity authorized to Publish Data on behalf of such Entity) that Publishes Data under this Agreement prior to Your Receiving it.
|
| 270 |
+
|
| 271 |
+
1.5 "Enhanced Data" means the subset of Data that You Publish and that is composed of (a) Your Additions and/or (b) Modifications to Data You have received under this Agreement.
|
| 272 |
+
|
| 273 |
+
1.6 "Entity" means any natural person or organization that exists under the laws of the jurisdiction in which it is organized, together with all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (a) the power, directly or indirectly, to cause the direction or management of such entity, whether by contract or otherwise, (b) the ownership of more than fifty percent (50%) of the outstanding shares or securities, (c) the beneficial ownership of such entity or, (d) the ability to appoint, whether by agreement or right, the majority of directors of an Entity.
|
| 274 |
+
|
| 275 |
+
1.7 "Modify" means to delete, erase, correct or re-arrange Data, resulting in “Modifications.” Modifications do not include Results.
|
| 276 |
+
|
| 277 |
+
1.8 "Publish" means to make all or a subset of Data (including Your Enhanced Data) available in any manner which enables its use, including by providing a copy on physical media or remote access. For any form of Entity, that is to make the Data available to any individual who is not employed by that Entity or engaged as a contractor or agent to perform work on that Entity's behalf. A "Publication" occurs each time you Publish Data.
|
| 278 |
+
|
| 279 |
+
1.9 "Receive" or "Receives" means to have been given access to Data, locally or remotely.
|
| 280 |
+
|
| 281 |
+
1.10 "Results" means the outcomes or outputs that You obtain from Your Computational Use of Data. Results shall not include more than a de minimis portion of the Data on which the Computational Use is based.
|
| 282 |
+
|
| 283 |
+
1.11 "Sui Generis Database Rights" means rights, other than copyright, resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other equivalent rights anywhere in the world.
|
| 284 |
+
|
| 285 |
+
1.12 "Use" means using Data (including accessing, copying, studying, reviewing, adapting, analyzing, evaluating, or making Computational Use of it), either by machines or humans, or a combination of both.
|
| 286 |
+
|
| 287 |
+
1.13 "You" or "Your" means any Entity that Receives Data under this Agreement.
|
| 288 |
+
|
| 289 |
+
Section 2. Right and License to Use and to Publish
|
| 290 |
+
|
| 291 |
+
2.1 Subject to the conditions set forth in Section 3 of this Agreement, Data Provider(s) hereby grant(s) to You a worldwide, non-exclusive, irrevocable (except as provided in Section 5) right to: (a) Use Data; and (b) Publish Data.
|
| 292 |
+
|
| 293 |
+
2.2 To the extent that the Data or the coordination, selection or arrangement of Data is protected or protectable under copyright, Sui Generis Database Rights, or other law, Data Provider(s) further agree(s) that such Data or coordination, selection or arrangement is hereby licensed to You and to anyone else who Receives Data under this Agreement for Use and Publication, subject to the conditions set forth in Section 3 of this Agreement.
|
| 294 |
+
|
| 295 |
+
2.3 Except for these rights and licenses expressly granted, no other intellectual property rights are granted or should be implied.
|
| 296 |
+
|
| 297 |
+
Section 3. Conditions on Rights Granted
|
| 298 |
+
|
| 299 |
+
3.1 If You Publish Data You Receive or Enhanced Data:
|
| 300 |
+
|
| 301 |
+
(a) You may do so under a license of your choice provided that you give anyone who receives the data from you the text of this Agreement, the name of this Agreement and/or a hyperlink or other method reasonably likely to provide a copy of the text of this Agreement; and
|
| 302 |
+
|
| 303 |
+
(b) You must cause any Data files containing Enhanced Data to carry prominent notices that you have changed those files; and
|
| 304 |
+
|
| 305 |
+
(c) If You Publish Data You Receive, You must preserve all credit or attribution to the Data Provider(s). Such retained credit or attribution includes any of the following to the extent they exist in the Data as You have Received it: legal notices or metadata; identification of the Data Provider(s); or hyperlinks to Data to the extent it is practical to do so.
|
| 306 |
+
|
| 307 |
+
3.2 You may provide additional or different license terms and conditions for use, reproduction, or distribution of that Enhanced Data, or for any combination of Data and Enhanced Data as a whole, provided that Your Use and Publication of that combined Data otherwise complies with the conditions stated in this License.
|
| 308 |
+
|
| 309 |
+
3.3 You and each Data Provider agree that Enhanced Data shall not be considered a work of joint authorship by virtue of its relationship to Data licensed under this Agreement and shall not require either any obligation of accounting to or the consent of any Data Provider.
|
| 310 |
+
|
| 311 |
+
3.4 This Agreement imposes no obligations or restrictions on Your Use or Publication of Results.
|
| 312 |
+
|
| 313 |
+
Section 4. Data Provider(s)' Representations
|
| 314 |
+
|
| 315 |
+
4.1 Each Data Provider represents that the Data Provider has exercised reasonable care, to assure that: (a) the Data it Publishes was created or generated by it or was obtained from others with the right to Publish the Data under this Agreement; and (b) Publication of such Data does not violate any privacy or confidentiality obligation undertaken by the Data Provider.
|
| 316 |
+
|
| 317 |
+
Section 5. Termination
|
| 318 |
+
|
| 319 |
+
5.1 All of Your rights under this Agreement will terminate, and Your right to Receive, Use or Publish the Data will be revoked or modified if You materially fail to comply with the terms and conditions of this Agreement and You do not cure such failure in a reasonable period of time after becoming aware of such noncompliance. If your rights under this Agreement terminate, you agree to cease Receipt, Use and Publication of Data. However, your obligations and any rights and permissions granted by you under this Agreement relating to Data that you published prior to such termination will continue and survive.
|
| 320 |
+
|
| 321 |
+
5.2 If you institute litigation against a Data Provider or anyone else who Receives the Data (including a cross-claim in a lawsuit) based on the Data, other than a claim asserting breach of this Agreement, then any rights previously granted to You to Receive, Use and Publish Data under this Agreement will terminate as of the date such litigation is filed.
|
| 322 |
+
|
| 323 |
+
Section 6. Disclaimer of Warranties and Limitation of Liability
|
| 324 |
+
|
| 325 |
+
6.1 EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE DATA (INCLUDING ENHANCED DATA) IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
|
| 326 |
+
|
| 327 |
+
6.2 NEITHER YOU NOR ANY DATA PROVIDERS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE DATA OR THE EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
|
| 328 |
+
|
| 329 |
+
Section 7. Miscellaneous
|
| 330 |
+
|
| 331 |
+
7.1 You agree that it is solely your responsibility to comply with all applicable laws with regard to Your Use or Publication of Data, including any applicable privacy, data protection, security and export laws. You agree to take reasonable steps to assist a Data Provider fulfilling responsibilities to comply with applicable laws with regard to Use or Publication of Data Received hereunder.
|
| 332 |
+
|
| 333 |
+
7.2 You and Data Provider(s), collectively and individually, waive and/or agree not to assert, to the extent permitted by law, any moral rights you or they hold in Data.
|
| 334 |
+
|
| 335 |
+
7.3 This Agreement confers no rights or remedies upon any person or entity other than the Parties and their respective heirs, executors, successors and assigns.
|
| 336 |
+
|
| 337 |
+
7.4 The Data Provider(s) reserve no right or expectation of privacy, data protection or confidentiality in any Data that they Publish under this Agreement. If you choose to Publish Data under this Agreement, you similarly do so with no reservation or expectation of any rights of privacy or confidentiality in that Data.
|
| 338 |
+
|
| 339 |
+
7.5 The Community Data License Agreement workgroup under The Linux Foundation is the steward of this Agreement (“Steward”). No one other than the Steward has the right to modify or publish new versions of this Agreement. Each version will be given a distinguishing version number. You may Use and Publish Data Received hereunder under the terms of the version of the Agreement under which You originally Received the Data, or under the terms of any subsequent version published by the Steward.
|
README.md
CHANGED
|
@@ -1,3 +1,738 @@
|
|
| 1 |
---
|
| 2 |
license: mit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
license: mit
|
| 3 |
+
library_name: dots_ocr_1_5
|
| 4 |
+
pipeline_tag: image-text-to-text
|
| 5 |
+
tags:
|
| 6 |
+
- image-to-text
|
| 7 |
+
- ocr
|
| 8 |
+
- document-parse
|
| 9 |
+
- layout
|
| 10 |
+
- table
|
| 11 |
+
- formula
|
| 12 |
+
- transformers
|
| 13 |
+
- custom_code
|
| 14 |
+
language:
|
| 15 |
+
- en
|
| 16 |
+
- zh
|
| 17 |
+
- multilingual
|
| 18 |
---
|
| 19 |
+
|
| 20 |
+
<div align="center">
|
| 21 |
+
|
| 22 |
+
<p align="center">
|
| 23 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/logo.png" width="300"/>
|
| 24 |
+
<p>
|
| 25 |
+
|
| 26 |
+
<h1 align="center">
|
| 27 |
+
dots.ocr-1.5: Recognize Any Human Scripts and Symbols
|
| 28 |
+
</h1>
|
| 29 |
+
|
| 30 |
+
[](https://huggingface.co/rednote-hilab/dots.ocr-1.5)
|
| 31 |
+
[](https://github.com/rednote-hilab/dots.ocr)
|
| 32 |
+
|
| 33 |
+
<div align="center">
|
| 34 |
+
<a href="https://dotsocr.xiaohongshu.com" target="_blank" rel="noopener noreferrer"><strong>🖥️ Live Demo</strong></a> |
|
| 35 |
+
<a href="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/wechat.jpg" target="_blank" rel="noopener noreferrer"><strong>💬 WeChat</strong></a> |
|
| 36 |
+
<a href="https://www.xiaohongshu.com/user/profile/683ffe42000000001d021a4c" target="_blank" rel="noopener noreferrer"><strong>📕 rednote</strong></a>
|
| 37 |
+
</div>
|
| 38 |
+
|
| 39 |
+
</div>
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
## Introduction
|
| 44 |
+
|
| 45 |
+
We present **dots.ocr-1.5**, a 3B-parameter multimodal model composed of a 1.2B vision encoder and a 1.7B language model. Designed for universal accessibility, it possesses the capability to recognize virtually any human script. Beyond achieving state-of-the-art (SOTA) performance in standard multilingual document parsing among models of comparable size, dots.ocr-1.5 excels at converting structured graphics (e.g., charts and diagrams) directly into SVG code, parsing web screens and spotting scene text. Furthermore, the model demonstrates competitive performance in general OCR, object grounding & counting tasks.
|
| 46 |
+
|
| 47 |
+
1. **Stronger Document Parsing Performance:** dots.ocr-1.5 maintains SOTA performance among latest OCR models, particularly on **multilingual documents**. Addressing the significant bias inherent in the detection & matching rules of certain benchmarks —which often fail to accurately reflect a model's true capabilities—we adopted an **Elo score** evaluation system. Under this metric, the performance landscape shifts significantly, highlighting the superior robustness of our model compared to conventional rankings.
|
| 48 |
+
2. **Unified Vision-Language Parsing**: Visual languages (e.g., charts, graphics, chemical formulas, logos) encapsulate dense human knowledge, akin to natural language. dots.ocr-1.5 unifies the interpretation of these elements by parsing them directly into SVG code. We have validated the effectiveness of this approach, demonstrating impressive results in structural and semantic recognition.
|
| 49 |
+
3. **Broader and More General Capabilities**: Compared to dots.ocr, dots.ocr-1.5 supports a significantly wider array of tasks. It extends beyond standard OCR to handle web screen parsing, scene text spotting, object grounding & counting, and other general OCR QA tasks.
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
## Evaluation
|
| 53 |
+
|
| 54 |
+
### 1. Document Parsing
|
| 55 |
+
|
| 56 |
+
#### 1.1 Elo Score of different bench between latest models
|
| 57 |
+
<table>
|
| 58 |
+
<thead>
|
| 59 |
+
<tr>
|
| 60 |
+
<th>models</th>
|
| 61 |
+
<th>olmOCR-Bench</th>
|
| 62 |
+
<th>OmniDocBench (v1.5)</th>
|
| 63 |
+
<th>XDocParse</th>
|
| 64 |
+
</tr>
|
| 65 |
+
</thead>
|
| 66 |
+
<tbody>
|
| 67 |
+
<tr>
|
| 68 |
+
<td>GLM-OCR</td>
|
| 69 |
+
<td>859.9</td>
|
| 70 |
+
<td>937.5</td>
|
| 71 |
+
<td>742.1</td>
|
| 72 |
+
</tr>
|
| 73 |
+
<tr>
|
| 74 |
+
<td>PaddleOCR-VL-1.5</td>
|
| 75 |
+
<td>873.6</td>
|
| 76 |
+
<td>965.6</td>
|
| 77 |
+
<td>797.6</td>
|
| 78 |
+
</tr>
|
| 79 |
+
<tr>
|
| 80 |
+
<td>HunyuanOCR</td>
|
| 81 |
+
<td>978.9</td>
|
| 82 |
+
<td>974.4</td>
|
| 83 |
+
<td>895.9</td>
|
| 84 |
+
</tr>
|
| 85 |
+
<tr>
|
| 86 |
+
<td>dots.ocr</td>
|
| 87 |
+
<td>1027.4</td>
|
| 88 |
+
<td>994.7</td>
|
| 89 |
+
<td>1133.4</td>
|
| 90 |
+
</tr>
|
| 91 |
+
<!-- Highlighting dots.ocr-1.5 row with bold tags -->
|
| 92 |
+
<tr>
|
| 93 |
+
<td><strong>dots.ocr-1.5</strong></td>
|
| 94 |
+
<td><strong>1089.0</strong></td>
|
| 95 |
+
<td><strong>1025.8</strong></td>
|
| 96 |
+
<td><strong>1157.1</strong></td>
|
| 97 |
+
</tr>
|
| 98 |
+
<tr>
|
| 99 |
+
<td>Gemini 3 Pro</td>
|
| 100 |
+
<td>1171.2</td>
|
| 101 |
+
<td>1102.1</td>
|
| 102 |
+
<td>1273.9</td>
|
| 103 |
+
</tr>
|
| 104 |
+
</tbody>
|
| 105 |
+
</table>
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
> **Notes:**
|
| 109 |
+
> - Results for Gemini 3 Pro, PaddleOCR-VL-1.5, and GLM-OCR were obtained via APIs, while HunyuanOCR results were generated using local inference.
|
| 110 |
+
> - The Elo score evaluation was conducted using Gemini 3 Flash. The prompt can be found at: [Elo Score Prompt](https://github.com/rednote-hilab/dots.ocr/blob/master/tools/elo_score_prompt.py). These results are consistent with the findings on [ocrarena](https://www.ocrarena.ai/battle).
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
#### 1.2 olmOCR-bench
|
| 114 |
+
<table>
|
| 115 |
+
<thead>
|
| 116 |
+
<tr>
|
| 117 |
+
<th>Model</th>
|
| 118 |
+
<th>ArXiv</th>
|
| 119 |
+
<th>Old scans math</th>
|
| 120 |
+
<th>Tables</th>
|
| 121 |
+
<th>Old scans</th>
|
| 122 |
+
<th>Headers & footers</th>
|
| 123 |
+
<th>Multi column</th>
|
| 124 |
+
<th>Long tiny text</th>
|
| 125 |
+
<th>Base</th>
|
| 126 |
+
<th>Overall</th>
|
| 127 |
+
</tr>
|
| 128 |
+
</thead>
|
| 129 |
+
<tbody>
|
| 130 |
+
<tr>
|
| 131 |
+
<td>Mistral OCR API</td>
|
| 132 |
+
<td>77.2</td>
|
| 133 |
+
<td>67.5</td>
|
| 134 |
+
<td>60.6</td>
|
| 135 |
+
<td>29.3</td>
|
| 136 |
+
<td>93.6</td>
|
| 137 |
+
<td>71.3</td>
|
| 138 |
+
<td>77.1</td>
|
| 139 |
+
<td>99.4</td>
|
| 140 |
+
<td>72.0±1.1</td>
|
| 141 |
+
</tr>
|
| 142 |
+
<tr>
|
| 143 |
+
<td>Marker 1.10.1</td>
|
| 144 |
+
<td>83.8</td>
|
| 145 |
+
<td>66.8</td>
|
| 146 |
+
<td>72.9</td>
|
| 147 |
+
<td>33.5</td>
|
| 148 |
+
<td>86.6</td>
|
| 149 |
+
<td>80.0</td>
|
| 150 |
+
<td>85.7</td>
|
| 151 |
+
<td>99.3</td>
|
| 152 |
+
<td>76.1±1.1</td>
|
| 153 |
+
</tr>
|
| 154 |
+
<tr>
|
| 155 |
+
<td>MinerU 2.5.4*</td>
|
| 156 |
+
<td>76.6</td>
|
| 157 |
+
<td>54.6</td>
|
| 158 |
+
<td>84.9</td>
|
| 159 |
+
<td>33.7</td>
|
| 160 |
+
<td>96.6</td>
|
| 161 |
+
<td>78.2</td>
|
| 162 |
+
<td>83.5</td>
|
| 163 |
+
<td>93.7</td>
|
| 164 |
+
<td>75.2±1.1</td>
|
| 165 |
+
</tr>
|
| 166 |
+
<tr>
|
| 167 |
+
<td>DeepSeek-OCR</td>
|
| 168 |
+
<td>77.2</td>
|
| 169 |
+
<td>73.6</td>
|
| 170 |
+
<td>80.2</td>
|
| 171 |
+
<td>33.3</td>
|
| 172 |
+
<td>96.1</td>
|
| 173 |
+
<td>66.4</td>
|
| 174 |
+
<td>79.4</td>
|
| 175 |
+
<td>99.8</td>
|
| 176 |
+
<td>75.7±1.0</td>
|
| 177 |
+
</tr>
|
| 178 |
+
<tr>
|
| 179 |
+
<td>Nanonets-OCR2-3B</td>
|
| 180 |
+
<td>75.4</td>
|
| 181 |
+
<td>46.1</td>
|
| 182 |
+
<td>86.8</td>
|
| 183 |
+
<td>40.9</td>
|
| 184 |
+
<td>32.1</td>
|
| 185 |
+
<td>81.9</td>
|
| 186 |
+
<td>93.0</td>
|
| 187 |
+
<td>99.6</td>
|
| 188 |
+
<td>69.5±1.1</td>
|
| 189 |
+
</tr>
|
| 190 |
+
<tr>
|
| 191 |
+
<td>PaddleOCR-VL*</td>
|
| 192 |
+
<td>85.7</td>
|
| 193 |
+
<td>71.0</td>
|
| 194 |
+
<td>84.1</td>
|
| 195 |
+
<td>37.8</td>
|
| 196 |
+
<td>97.0</td>
|
| 197 |
+
<td>79.9</td>
|
| 198 |
+
<td>85.7</td>
|
| 199 |
+
<td>98.5</td>
|
| 200 |
+
<td>80.0±1.0</td>
|
| 201 |
+
</tr>
|
| 202 |
+
<tr>
|
| 203 |
+
<td>Infinity-Parser 7B*</td>
|
| 204 |
+
<td>84.4</td>
|
| 205 |
+
<td>83.8</td>
|
| 206 |
+
<td>85.0</td>
|
| 207 |
+
<td>47.9</td>
|
| 208 |
+
<td>88.7</td>
|
| 209 |
+
<td>84.2</td>
|
| 210 |
+
<td>86.4</td>
|
| 211 |
+
<td>99.8</td>
|
| 212 |
+
<td>82.5±?</td>
|
| 213 |
+
</tr>
|
| 214 |
+
<tr>
|
| 215 |
+
<td>olmOCR v0.4.0</td>
|
| 216 |
+
<td>83.0</td>
|
| 217 |
+
<td>82.3</td>
|
| 218 |
+
<td>84.9</td>
|
| 219 |
+
<td>47.7</td>
|
| 220 |
+
<td>96.1</td>
|
| 221 |
+
<td>83.7</td>
|
| 222 |
+
<td>81.9</td>
|
| 223 |
+
<td>99.7</td>
|
| 224 |
+
<td>82.4±1.1</td>
|
| 225 |
+
</tr>
|
| 226 |
+
<tr>
|
| 227 |
+
<td>Chandra OCR 0.1.0*</td>
|
| 228 |
+
<td>82.2</td>
|
| 229 |
+
<td>80.3</td>
|
| 230 |
+
<td>88.0</td>
|
| 231 |
+
<td>50.4</td>
|
| 232 |
+
<td>90.8</td>
|
| 233 |
+
<td>81.2</td>
|
| 234 |
+
<td>92.3</td>
|
| 235 |
+
<td>99.9</td>
|
| 236 |
+
<td>83.1±0.9</td>
|
| 237 |
+
</tr>
|
| 238 |
+
<tr>
|
| 239 |
+
<td>dots.ocr</td>
|
| 240 |
+
<td>82.1</td>
|
| 241 |
+
<td>64.2</td>
|
| 242 |
+
<td>88.3</td>
|
| 243 |
+
<td>40.9</td>
|
| 244 |
+
<td>94.1</td>
|
| 245 |
+
<td>82.4</td>
|
| 246 |
+
<td>81.2</td>
|
| 247 |
+
<td>99.5</td>
|
| 248 |
+
<td>79.1±1.0</td>
|
| 249 |
+
</tr>
|
| 250 |
+
<tr>
|
| 251 |
+
<td><strong>dots.ocr-1.5</strong></td>
|
| 252 |
+
<td><strong>85.9</strong></td>
|
| 253 |
+
<td><strong>85.5</strong></td>
|
| 254 |
+
<td><strong>90.7</strong></td>
|
| 255 |
+
<td>48.2</td>
|
| 256 |
+
<td>94.0</td>
|
| 257 |
+
<td><strong>85.3</strong></td>
|
| 258 |
+
<td>81.6</td>
|
| 259 |
+
<td>99.7</td>
|
| 260 |
+
<td><strong>83.9±0.9</strong></td>
|
| 261 |
+
</tr>
|
| 262 |
+
</tbody>
|
| 263 |
+
</table>
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
> **Note:**
|
| 267 |
+
> - The metrics are from [olmocr](https://github.com/allenai/olmocr), and our own internal evaluations.
|
| 268 |
+
> - We delete the Page-header and Page-footer cells in the result markdown.
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
#### 1.3 Other Benchmarks
|
| 272 |
+
|
| 273 |
+
<table>
|
| 274 |
+
<thead>
|
| 275 |
+
<tr>
|
| 276 |
+
<th>Model Type</th>
|
| 277 |
+
<th>Methods</th>
|
| 278 |
+
<th>Size</th>
|
| 279 |
+
<th>OmniDocBench(v1.5)<br>TextEdit↓</th>
|
| 280 |
+
<th>OmniDocBench(v1.5)<br>Read OrderEdit↓</th>
|
| 281 |
+
<th>pdf-parse-bench</th>
|
| 282 |
+
</tr>
|
| 283 |
+
</thead>
|
| 284 |
+
<tbody>
|
| 285 |
+
<!-- GeneralVLMs Group (Reversed Order, 3 rows) -->
|
| 286 |
+
<tr>
|
| 287 |
+
<td rowspan="3"><strong>GeneralVLMs</strong></td>
|
| 288 |
+
<td>Gemini-2.5 Pro</td>
|
| 289 |
+
<td>-</td>
|
| 290 |
+
<td>0.075</td>
|
| 291 |
+
<td>0.097</td>
|
| 292 |
+
<td>9.06</td>
|
| 293 |
+
</tr>
|
| 294 |
+
<tr>
|
| 295 |
+
<td>Qwen3-VL-235B-A22B-Instruct</td>
|
| 296 |
+
<td>235B</td>
|
| 297 |
+
<td>0.069</td>
|
| 298 |
+
<td>0.068</td>
|
| 299 |
+
<td><strong>9.71</strong></td>
|
| 300 |
+
</tr>
|
| 301 |
+
<tr>
|
| 302 |
+
<td>Gemini 3 Pro</td>
|
| 303 |
+
<td>-</td>
|
| 304 |
+
<td>0.066</td>
|
| 305 |
+
<td>0.079</td>
|
| 306 |
+
<td>9.68</td>
|
| 307 |
+
</tr>
|
| 308 |
+
<!-- SpecializedVLMs Group (Reversed Order, 12 rows) -->
|
| 309 |
+
<tr>
|
| 310 |
+
<td rowspan="12"><strong>SpecializedVLMs</strong></td>
|
| 311 |
+
<td>Mistral OCR</td>
|
| 312 |
+
<td>-</td>
|
| 313 |
+
<td>0.164</td>
|
| 314 |
+
<td>0.144</td>
|
| 315 |
+
<td>8.84</td>
|
| 316 |
+
</tr>
|
| 317 |
+
<tr>
|
| 318 |
+
<td>Deepseek-OCR</td>
|
| 319 |
+
<td>3B</td>
|
| 320 |
+
<td>0.073</td>
|
| 321 |
+
<td>0.086</td>
|
| 322 |
+
<td>8.26</td>
|
| 323 |
+
</tr>
|
| 324 |
+
<tr>
|
| 325 |
+
<td>MonkeyOCR-3B</td>
|
| 326 |
+
<td>3B</td>
|
| 327 |
+
<td>0.075</td>
|
| 328 |
+
<td>0.129</td>
|
| 329 |
+
<td>9.27</td>
|
| 330 |
+
</tr>
|
| 331 |
+
<tr>
|
| 332 |
+
<td>OCRVerse</td>
|
| 333 |
+
<td>4B</td>
|
| 334 |
+
<td>0.058</td>
|
| 335 |
+
<td>0.071</td>
|
| 336 |
+
<td>--</td>
|
| 337 |
+
</tr>
|
| 338 |
+
<tr>
|
| 339 |
+
<td>MonkeyOCR-pro-3B</td>
|
| 340 |
+
<td>3B</td>
|
| 341 |
+
<td>0.075</td>
|
| 342 |
+
<td>0.128</td>
|
| 343 |
+
<td>-</td>
|
| 344 |
+
</tr>
|
| 345 |
+
<tr>
|
| 346 |
+
<td>MinerU2.5</td>
|
| 347 |
+
<td>1.2B</td>
|
| 348 |
+
<td>0.047</td>
|
| 349 |
+
<td>0.044</td>
|
| 350 |
+
<td>-</td>
|
| 351 |
+
</tr>
|
| 352 |
+
<tr>
|
| 353 |
+
<td>PaddleOCR-VL</td>
|
| 354 |
+
<td>0.9B</td>
|
| 355 |
+
<td>0.035</td>
|
| 356 |
+
<td>0.043</td>
|
| 357 |
+
<td>9.51</td>
|
| 358 |
+
</tr>
|
| 359 |
+
<tr>
|
| 360 |
+
<td>HunyuanOCR</td>
|
| 361 |
+
<td>0.9B</td>
|
| 362 |
+
<td>0.042</td>
|
| 363 |
+
<td>-</td>
|
| 364 |
+
<td>-</td>
|
| 365 |
+
</tr>
|
| 366 |
+
<tr>
|
| 367 |
+
<td>PaddleOCR-VL1.5</td>
|
| 368 |
+
<td>0.9B</td>
|
| 369 |
+
<td>0.035</td>
|
| 370 |
+
<td>0.042</td>
|
| 371 |
+
<td>-</td>
|
| 372 |
+
</tr>
|
| 373 |
+
<tr>
|
| 374 |
+
<td>GLMOCR</td>
|
| 375 |
+
<td>0.9B</td>
|
| 376 |
+
<td>0.04</td>
|
| 377 |
+
<td>0.043</td>
|
| 378 |
+
<td>-</td>
|
| 379 |
+
</tr>
|
| 380 |
+
<tr>
|
| 381 |
+
<td>dots.ocr</td>
|
| 382 |
+
<td>3B</td>
|
| 383 |
+
<td>0.048</td>
|
| 384 |
+
<td>0.053</td>
|
| 385 |
+
<td>9.29</td>
|
| 386 |
+
</tr>
|
| 387 |
+
<tr>
|
| 388 |
+
<td><u><strong>dots.ocr-1.5</strong></u></td>
|
| 389 |
+
<td>3B</td>
|
| 390 |
+
<td><strong>0.031</strong></td>
|
| 391 |
+
<td><strong>0.029</strong></td>
|
| 392 |
+
<td>9.54</td>
|
| 393 |
+
</tr>
|
| 394 |
+
</tbody>
|
| 395 |
+
</table>
|
| 396 |
+
|
| 397 |
+
> **Note:**
|
| 398 |
+
> - Metrics are sourced from [OmniDocBench](https://github.com/opendatalab/OmniDocBench) and other model publications. [pdf-parse-bench](https://github.com/phorn1/pdf-parse-bench) results are reproduced by Qwen3-VL-235B-A22B-Instruct.
|
| 399 |
+
> - Formula and Table metrics for OmniDocBench1.5 are omitted due to their high sensitivity to detection and matching protocols.
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
### 2. Vision-Language Parsing
|
| 403 |
+
Visual languages (e.g., charts, graphics, chemical formulas, logos) encapsulate dense human knowledge. **dots.ocr-1.5** unifies the interpretation of these elements by parsing them directly into **SVG code**.
|
| 404 |
+
|
| 405 |
+
<table>
|
| 406 |
+
<thead>
|
| 407 |
+
<tr>
|
| 408 |
+
<th rowspan="2" style="text-align: left;">Methods</th>
|
| 409 |
+
<th colspan="3">Unisvg</th>
|
| 410 |
+
<th rowspan="2">Chartmimic</th>
|
| 411 |
+
<th rowspan="2">Design2Code</th>
|
| 412 |
+
<th rowspan="2">Genexam</th>
|
| 413 |
+
<th rowspan="2">SciGen</th>
|
| 414 |
+
<th rowspan="2">ChemDraw</th>
|
| 415 |
+
</tr>
|
| 416 |
+
<tr>
|
| 417 |
+
<th>Low-Level</th>
|
| 418 |
+
<th>High-Level</th>
|
| 419 |
+
<th>Score</th>
|
| 420 |
+
</tr>
|
| 421 |
+
</thead>
|
| 422 |
+
<tbody>
|
| 423 |
+
<tr>
|
| 424 |
+
<td style="text-align: left;">OCRVerse</td>
|
| 425 |
+
<td>0.632</td>
|
| 426 |
+
<td>0.852</td>
|
| 427 |
+
<td>0.763</td>
|
| 428 |
+
<td>0.799</td>
|
| 429 |
+
<td>-</td>
|
| 430 |
+
<td>-</td>
|
| 431 |
+
<td>-</td>
|
| 432 |
+
<td>0.881</td>
|
| 433 |
+
</tr>
|
| 434 |
+
<tr>
|
| 435 |
+
<td style="text-align: left;">Gemini 3 Pro</td>
|
| 436 |
+
<td>0.563</td>
|
| 437 |
+
<td>0.850</td>
|
| 438 |
+
<td>0.735</td>
|
| 439 |
+
<td>0.788</td>
|
| 440 |
+
<td>0.760</td>
|
| 441 |
+
<td>0.756</td>
|
| 442 |
+
<td>0.783</td>
|
| 443 |
+
<td>0.839</td>
|
| 444 |
+
</tr>
|
| 445 |
+
<tr>
|
| 446 |
+
<td style="text-align: left;">dots.ocr-1.5</td>
|
| 447 |
+
<td>0.850</td>
|
| 448 |
+
<td>0.923</td>
|
| 449 |
+
<td>0.894</td>
|
| 450 |
+
<td>0.772</td>
|
| 451 |
+
<td>0.801</td>
|
| 452 |
+
<td>0.664</td>
|
| 453 |
+
<td>0.660</td>
|
| 454 |
+
<td>0.790</td>
|
| 455 |
+
</tr>
|
| 456 |
+
<tr>
|
| 457 |
+
<td style="text-align: left;"><strong>dots.ocr-1.5-svg</strong></td>
|
| 458 |
+
<td><strong>0.860</strong></td>
|
| 459 |
+
<td><strong>0.931</strong></td>
|
| 460 |
+
<td><strong>0.902</strong></td>
|
| 461 |
+
<td><strong>0.905</strong></td>
|
| 462 |
+
<td><strong>0.834</strong></td>
|
| 463 |
+
<td><strong>0.8</strong></td>
|
| 464 |
+
<td><strong>0.797</strong></td>
|
| 465 |
+
<td><strong>0.901</strong></td>
|
| 466 |
+
</tr>
|
| 467 |
+
</tbody>
|
| 468 |
+
</table>
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
> **Note:**
|
| 472 |
+
> - We use the ISVGEN metric from [UniSVG](https://ryanlijinke.github.io/) to evaluate the parsing result. For benchmarks that do not natively support image parsing, we use the original images as input, and calculate the ISVGEN score between the rendered output and the original image.
|
| 473 |
+
> - [OCRVerse](https://github.com/DocTron-hub/OCRVerse) results are derived from various code formats (e.g., SVG, Python), whereas results for Gemini 3 Pro and dots.ocr-1.5 are based specifically on SVG code.
|
| 474 |
+
> - Due to the capacity constraints of a 3B-parameter VLM, dots.ocr-1.5 may not yet excel at every task, such as SVG generation. To complement this, we are simultaneously releasing dots.ocr-1.5-svg. We plan to further address these limitations in future updates.
|
| 475 |
+
|
| 476 |
+
|
| 477 |
+
### 3. General Vision Tasks
|
| 478 |
+
|
| 479 |
+
<table>
|
| 480 |
+
<thead>
|
| 481 |
+
<tr>
|
| 482 |
+
<th>Model</th>
|
| 483 |
+
<th>CharXiv_descriptive</th>
|
| 484 |
+
<th>CharXiv_reasoning</th>
|
| 485 |
+
<th>OCR_Reasoning</th>
|
| 486 |
+
<th>infovqa</th>
|
| 487 |
+
<th>docvqa</th>
|
| 488 |
+
<th>ChartQA</th>
|
| 489 |
+
<th>OCRBench</th>
|
| 490 |
+
<th>AI2D</th>
|
| 491 |
+
<th>CountBenchQA</th>
|
| 492 |
+
<th>refcoco</th>
|
| 493 |
+
</tr>
|
| 494 |
+
</thead>
|
| 495 |
+
<tbody>
|
| 496 |
+
<tr>
|
| 497 |
+
<td>Qwen3vl-2b-instruct</td>
|
| 498 |
+
<td>62.3</td>
|
| 499 |
+
<td>26.8</td>
|
| 500 |
+
<td>-</td>
|
| 501 |
+
<td>72.4</td>
|
| 502 |
+
<td>93.3</td>
|
| 503 |
+
<td>-</td>
|
| 504 |
+
<td>85.8</td>
|
| 505 |
+
<td>76.9</td>
|
| 506 |
+
<td>88.4</td>
|
| 507 |
+
<td>-</td>
|
| 508 |
+
</tr>
|
| 509 |
+
<tr>
|
| 510 |
+
<td><strong>dots.ocr-1.5</strong></td>
|
| 511 |
+
<td>77.4</td>
|
| 512 |
+
<td>55.3</td>
|
| 513 |
+
<td>22.85</td>
|
| 514 |
+
<td>73.76</td>
|
| 515 |
+
<td>91.85</td>
|
| 516 |
+
<td>83.2</td>
|
| 517 |
+
<td>86.0</td>
|
| 518 |
+
<td>82.16</td>
|
| 519 |
+
<td>94.46</td>
|
| 520 |
+
<td>80.03</td>
|
| 521 |
+
</tr>
|
| 522 |
+
</tbody>
|
| 523 |
+
</table>
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
|
| 527 |
+
# Quick Start
|
| 528 |
+
## 1. Installation
|
| 529 |
+
### Install dots.ocr-1.5
|
| 530 |
+
```shell
|
| 531 |
+
conda create -n dots_ocr python=3.12
|
| 532 |
+
conda activate dots_ocr
|
| 533 |
+
|
| 534 |
+
git clone https://github.com/rednote-hilab/dots.ocr.git
|
| 535 |
+
cd dots.ocr
|
| 536 |
+
|
| 537 |
+
# Install pytorch, see https://pytorch.org/get-started/previous-versions/ for your cuda version
|
| 538 |
+
pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu128
|
| 539 |
+
pip install -e .
|
| 540 |
+
```
|
| 541 |
+
|
| 542 |
+
If you have trouble with the installation, try our [Docker Image](https://hub.docker.com/r/rednotehilab/dots.ocr) for an easier setup, and follow these steps:
|
| 543 |
+
```shell
|
| 544 |
+
git clone https://github.com/rednote-hilab/dots.ocr.git
|
| 545 |
+
cd dots.ocr
|
| 546 |
+
pip install -e .
|
| 547 |
+
```
|
| 548 |
+
|
| 549 |
+
|
| 550 |
+
### Download Model Weights
|
| 551 |
+
> 💡**Note:** Please use a directory name without periods (e.g., `DotsOCR_1_5` instead of `dots.ocr-1.5`) for the model save path. This is a temporary workaround pending our integration with Transformers.
|
| 552 |
+
```shell
|
| 553 |
+
python3 tools/download_model.py
|
| 554 |
+
```
|
| 555 |
+
|
| 556 |
+
|
| 557 |
+
## 2. Deployment
|
| 558 |
+
### vLLM inference
|
| 559 |
+
We highly recommend using vLLM for deployment and inference.
|
| 560 |
+
|
| 561 |
+
```shell
|
| 562 |
+
# launch vllm server
|
| 563 |
+
## dots.ocr-1.5
|
| 564 |
+
CUDA_VISIBLE_DEVICES=0 vllm serve rednote-hilab/dots.ocr-1.5 --tensor-parallel-size 1 --gpu-memory-utilization 0.9 --chat-template-content-format string --served-model-name model --trust-remote-code
|
| 565 |
+
|
| 566 |
+
## dots.ocr-1.5-svg
|
| 567 |
+
CUDA_VISIBLE_DEVICES=0 vllm serve rednote-hilab/dots.ocr-1.5-svg --tensor-parallel-size 1 --gpu-memory-utilization 0.9 --chat-template-content-format string --served-model-name model --trust-remote-code
|
| 568 |
+
|
| 569 |
+
# vllm api demo
|
| 570 |
+
## document parsing
|
| 571 |
+
python3 ./demo/demo_vllm.py --prompt_mode prompt_layout_all_en
|
| 572 |
+
## web parsing
|
| 573 |
+
python3 ./demo/demo_vllm.py --prompt_mode prompt_web_parsing --image_path ./assets/showcase_dots_ocr_1_5/origin/webpage_1.png
|
| 574 |
+
## scene spotting
|
| 575 |
+
python3 ./demo/demo_vllm.py --prompt_mode prompt_scene_spotting --image_path ./assets/showcase_dots_ocr_1_5/origin/scene_1.jpg
|
| 576 |
+
## image parsing with svg code
|
| 577 |
+
python3 ./demo/demo_vllm_svg.py --prompt_mode prompt_image_to_svg
|
| 578 |
+
## general qa
|
| 579 |
+
python3 ./demo/demo_vllm_general.py
|
| 580 |
+
```
|
| 581 |
+
|
| 582 |
+
### Hugging Face inference
|
| 583 |
+
```shell
|
| 584 |
+
python3 demo/demo_hf.py
|
| 585 |
+
```
|
| 586 |
+
|
| 587 |
+
<details>
|
| 588 |
+
<summary><b>Hugging Face inference details</b></summary>
|
| 589 |
+
|
| 590 |
+
```python
|
| 591 |
+
import torch
|
| 592 |
+
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
|
| 593 |
+
from qwen_vl_utils import process_vision_info
|
| 594 |
+
from dots_ocr.utils import dict_promptmode_to_prompt
|
| 595 |
+
|
| 596 |
+
model_path = "./weights/DotsOCR_1_5"
|
| 597 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 598 |
+
model_path,
|
| 599 |
+
attn_implementation="flash_attention_2",
|
| 600 |
+
torch_dtype=torch.bfloat16,
|
| 601 |
+
device_map="auto",
|
| 602 |
+
trust_remote_code=True
|
| 603 |
+
)
|
| 604 |
+
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
|
| 605 |
+
|
| 606 |
+
image_path = "demo/demo_image1.jpg"
|
| 607 |
+
prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
|
| 608 |
+
|
| 609 |
+
1. Bbox format: [x1, y1, x2, y2]
|
| 610 |
+
|
| 611 |
+
2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
|
| 612 |
+
|
| 613 |
+
3. Text Extraction & Formatting Rules:
|
| 614 |
+
- Picture: For the 'Picture' category, the text field should be omitted.
|
| 615 |
+
- Formula: Format its text as LaTeX.
|
| 616 |
+
- Table: Format its text as HTML.
|
| 617 |
+
- All Others (Text, Title, etc.): Format their text as Markdown.
|
| 618 |
+
|
| 619 |
+
4. Constraints:
|
| 620 |
+
- The output text must be the original text from the image, with no translation.
|
| 621 |
+
- All layout elements must be sorted according to human reading order.
|
| 622 |
+
|
| 623 |
+
5. Final Output: The entire output must be a single JSON object.
|
| 624 |
+
"""
|
| 625 |
+
|
| 626 |
+
messages = [
|
| 627 |
+
{
|
| 628 |
+
"role": "user",
|
| 629 |
+
"content": [
|
| 630 |
+
{
|
| 631 |
+
"type": "image",
|
| 632 |
+
"image": image_path
|
| 633 |
+
},
|
| 634 |
+
{"type": "text", "text": prompt}
|
| 635 |
+
]
|
| 636 |
+
}
|
| 637 |
+
]
|
| 638 |
+
|
| 639 |
+
# Preparation for inference
|
| 640 |
+
text = processor.apply_chat_template(
|
| 641 |
+
messages,
|
| 642 |
+
tokenize=False,
|
| 643 |
+
add_generation_prompt=True
|
| 644 |
+
)
|
| 645 |
+
image_inputs, video_inputs = process_vision_info(messages)
|
| 646 |
+
inputs = processor(
|
| 647 |
+
text=[text],
|
| 648 |
+
images=image_inputs,
|
| 649 |
+
videos=video_inputs,
|
| 650 |
+
padding=True,
|
| 651 |
+
return_tensors="pt",
|
| 652 |
+
)
|
| 653 |
+
|
| 654 |
+
inputs = inputs.to("cuda")
|
| 655 |
+
|
| 656 |
+
# Inference: Generation of the output
|
| 657 |
+
generated_ids = model.generate(**inputs, max_new_tokens=24000)
|
| 658 |
+
generated_ids_trimmed = [
|
| 659 |
+
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
|
| 660 |
+
]
|
| 661 |
+
output_text = processor.batch_decode(
|
| 662 |
+
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
| 663 |
+
)
|
| 664 |
+
print(output_text)
|
| 665 |
+
|
| 666 |
+
```
|
| 667 |
+
|
| 668 |
+
</details>
|
| 669 |
+
|
| 670 |
+
## 3. Document Parse
|
| 671 |
+
**Based on vLLM server**, you can parse an image or a pdf file using the following commands:
|
| 672 |
+
```bash
|
| 673 |
+
|
| 674 |
+
# Parse all layout info, both detection and recognition
|
| 675 |
+
# Parse a single image
|
| 676 |
+
python3 dots_ocr/parser.py demo/demo_image1.jpg
|
| 677 |
+
# Parse a single PDF
|
| 678 |
+
python3 dots_ocr/parser.py demo/demo_pdf1.pdf --num_thread 64 # try bigger num_threads for pdf with a large number of pages
|
| 679 |
+
|
| 680 |
+
# Layout detection only
|
| 681 |
+
python3 dots_ocr/parser.py demo/demo_image1.jpg --prompt prompt_layout_only_en
|
| 682 |
+
|
| 683 |
+
# Parse text only, except Page-header and Page-footer
|
| 684 |
+
python3 dots_ocr/parser.py demo/demo_image1.jpg --prompt prompt_ocr
|
| 685 |
+
|
| 686 |
+
|
| 687 |
+
```
|
| 688 |
+
|
| 689 |
+
<details>
|
| 690 |
+
<summary><b>Output Results</b></summary>
|
| 691 |
+
|
| 692 |
+
1. **Structured Layout Data** (`demo_image1.json`): A JSON file containing the detected layout elements, including their bounding boxes, categories, and extracted text.
|
| 693 |
+
2. **Processed Markdown File** (`demo_image1.md`): A Markdown file generated from the concatenated text of all detected cells.
|
| 694 |
+
* An additional version, `demo_image1_nohf.md`, is also provided, which excludes page headers and footers for compatibility with benchmarks like Omnidocbench and olmOCR-bench.
|
| 695 |
+
3. **Layout Visualization** (`demo_image1.jpg`): The original image with the detected layout bounding boxes drawn on it.
|
| 696 |
+
|
| 697 |
+
</details>
|
| 698 |
+
|
| 699 |
+
## 4. Demo
|
| 700 |
+
Have fun with the [live demo](https://dotsocr.xiaohongshu.com/).
|
| 701 |
+
|
| 702 |
+
|
| 703 |
+
### Examples for document parsing
|
| 704 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/formula1.png" alt="formula1.png" border="0" />
|
| 705 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/table3.png" alt="table3.png" border="0" />
|
| 706 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/Tibetan.png" alt="Tibetan.png" border="0" />
|
| 707 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/tradition_zh.png" alt="tradition_zh.png" border="0" />
|
| 708 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/nl.png" alt="nl.png" border="0" />
|
| 709 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/kannada.png" alt="kannada.png" border="0" />
|
| 710 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase/russian.png" alt="russian.png" border="0" />
|
| 711 |
+
|
| 712 |
+
|
| 713 |
+
### Examples for image parsing
|
| 714 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase_dots_ocr_1_5/result/svg_1.png" alt="svg_1.png" border="0" />
|
| 715 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase_dots_ocr_1_5/result/svg_2.png" alt="svg_2.png" border="0" />
|
| 716 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase_dots_ocr_1_5/result/svg_4.png" alt="svg_4.png" border="0" />
|
| 717 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase_dots_ocr_1_5/result/svg_5.png" alt="svg_5.png" border="0" />
|
| 718 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase_dots_ocr_1_5/result/svg_6.png" alt="svg_6.png" border="0" />
|
| 719 |
+
|
| 720 |
+
> **Note:**
|
| 721 |
+
> - Inferenced by dots.ocr-1.5-svg
|
| 722 |
+
|
| 723 |
+
### Example for web parsing
|
| 724 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase_dots_ocr_1_5/result/webpage_1.png" alt="webpage_1.png" border="0" />
|
| 725 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase_dots_ocr_1_5/result/webpage_2.png" alt="webpage_2.png" border="0" />
|
| 726 |
+
|
| 727 |
+
### Examples for scene spotting
|
| 728 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase_dots_ocr_1_5/result/scene_1.png" alt="scene_1.png" border="0" />
|
| 729 |
+
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/showcase_dots_ocr_1_5/result/scene_2.png" alt="scene_2.png" border="0" />
|
| 730 |
+
|
| 731 |
+
|
| 732 |
+
## Limitation & Future Work
|
| 733 |
+
|
| 734 |
+
- **Complex Document Elements:**
|
| 735 |
+
- **Table&Formula**: The extraction of complex tables and mathematical formulas persists as a difficult task given the model's compact architecture.
|
| 736 |
+
- **Picture**: We have adopted an SVG code representation for parsing structured graphics; however, the performance has yet to achieve the desired level of robustness.
|
| 737 |
+
|
| 738 |
+
- **Parsing Failures:** While we have reduced the rate of parsing failures compared to the previous version, these issues may still occur occasionally. We remain committed to further resolving these edge cases in future updates.
|
chat_template.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{%- for m in messages %}{%- if m.role == 'system' %}{{- '<|system|>' + m.content + '<|endofsystem|>\n' }}{%- elif m.role == 'user' %}{% if m.content is string %}{{- '<|user|>' + m.content + '<|endofuser|>' }}{% else %} {% for content in m.content %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|img|><|imgpad|><|endofimg|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|img|><|video_pad|><|endofimg|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{%- endif %}{%- elif m.role == 'assistant' %}{{- '<|assistant|>' + m.content }}{%- if not loop.last %}{{- '<|endofassistant|>' }}{%- endif %}{%- endif %}{%- endfor %}{%- if messages[-1].role != 'assistant' %}{{- '<|assistant|>' }}{%- endif %}"
|
| 3 |
+
}
|
config.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"DotsOCRForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"model_type": "dots_ocr",
|
| 6 |
+
"auto_map": {
|
| 7 |
+
"AutoConfig": "configuration_dots.DotsOCRConfig",
|
| 8 |
+
"AutoModelForCausalLM": "modeling_dots_ocr.DotsOCRForCausalLM"
|
| 9 |
+
},
|
| 10 |
+
"attention_bias": true,
|
| 11 |
+
"attention_dropout": 0.0,
|
| 12 |
+
"hidden_act": "silu",
|
| 13 |
+
"hidden_size": 1536,
|
| 14 |
+
"initializer_range": 0.02,
|
| 15 |
+
"intermediate_size": 8960,
|
| 16 |
+
"max_position_embeddings": 131072,
|
| 17 |
+
"max_window_layers": 28,
|
| 18 |
+
"num_attention_heads": 12,
|
| 19 |
+
"num_hidden_layers": 28,
|
| 20 |
+
"num_key_value_heads": 2,
|
| 21 |
+
"rms_norm_eps": 1e-06,
|
| 22 |
+
"rope_scaling": null,
|
| 23 |
+
"rope_theta": 1000000,
|
| 24 |
+
"sliding_window": 131072,
|
| 25 |
+
"tie_word_embeddings": false,
|
| 26 |
+
"torch_dtype": "bfloat16",
|
| 27 |
+
"transformers_version": "4.51.0",
|
| 28 |
+
"use_cache": true,
|
| 29 |
+
"use_sliding_window": false,
|
| 30 |
+
"vocab_size": 151936,
|
| 31 |
+
"image_token_id": 151665,
|
| 32 |
+
"video_token_id": 151656,
|
| 33 |
+
"vision_config": {
|
| 34 |
+
"embed_dim": 1536,
|
| 35 |
+
"hidden_size": 1536,
|
| 36 |
+
"intermediate_size": 4224,
|
| 37 |
+
"num_hidden_layers": 42,
|
| 38 |
+
"num_attention_heads": 12,
|
| 39 |
+
"num_channels": 3,
|
| 40 |
+
"patch_size": 14,
|
| 41 |
+
"post_norm": true,
|
| 42 |
+
"rms_norm_eps": 1e-05,
|
| 43 |
+
"spatial_merge_size": 2,
|
| 44 |
+
"temporal_patch_size": 1,
|
| 45 |
+
"use_bias": false,
|
| 46 |
+
"attn_implementation": "flash_attention_2",
|
| 47 |
+
"init_merger_std": 0.02,
|
| 48 |
+
"initializer_range": 0.02,
|
| 49 |
+
"is_causal": false
|
| 50 |
+
}
|
| 51 |
+
}
|
configuration.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"framework": "pytorch", "task": "image-text-to-text", "allow_remote": true}
|
configuration_dots.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Optional
|
| 2 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 3 |
+
from transformers.models.qwen2 import Qwen2Config
|
| 4 |
+
from transformers import Qwen2_5_VLProcessor, AutoProcessor
|
| 5 |
+
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class DotsVisionConfig(PretrainedConfig):
    """Configuration for the dots.ocr vision tower (ViT-style encoder plus
    patch merger).

    All defaults match the released dots.ocr-1.5 checkpoint; any extra
    keyword arguments are forwarded to ``PretrainedConfig``.
    """

    model_type: str = "dots_vit"

    def __init__(
        self,
        embed_dim: int = 1536,  # width of the vision encoder itself
        hidden_size: int = 1536,  # width after the patch merger
        intermediate_size: int = 4224,
        num_hidden_layers: int = 42,
        num_attention_heads: int = 12,
        num_channels: int = 3,
        patch_size: int = 14,
        spatial_merge_size: int = 2,
        temporal_patch_size: int = 1,
        rms_norm_eps: float = 1e-5,
        use_bias: bool = False,
        attn_implementation="flash_attention_2",  # "eager" | "sdpa" | "flash_attention_2"
        initializer_range=0.02,
        init_merger_std=0.02,
        is_causal=False,  # whether the vision encoder runs a causal forward
        post_norm=True,
        gradient_checkpointing=False,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        # Encoder geometry.
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # Patchification of the input pixels.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        # Normalization / attention behaviour.
        self.rms_norm_eps = rms_norm_eps
        self.use_bias = use_bias
        self.attn_implementation = attn_implementation
        self.is_causal = is_causal
        self.post_norm = post_norm
        # Initialization and training-time options.
        self.initializer_range = initializer_range
        self.init_merger_std = init_merger_std
        self.gradient_checkpointing = gradient_checkpointing
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class DotsOCRConfig(Qwen2Config):
    """Configuration for the dots.ocr vision-language model.

    Extends ``Qwen2Config`` (the language backbone) with the placeholder
    token ids used for visual inputs and a nested :class:`DotsVisionConfig`.
    """

    model_type = "dots_ocr"

    def __init__(
        self,
        image_token_id=151665,
        video_token_id=151656,
        vision_config: Optional[dict] = None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        # ``vision_config`` arrives as a plain dict when loaded from
        # config.json (or may be absent entirely); normalize to the typed
        # config object either way.
        self.vision_config = DotsVisionConfig(**(vision_config or {}))

    def save_pretrained(self, save_directory, **kwargs):
        # Clear the auto-class registration before serializing so the saved
        # config carries no ``auto_map`` entry — presumably a temporary
        # workaround pending native transformers integration; confirm before
        # removing.
        self._auto_class = None
        super().save_pretrained(save_directory, **kwargs)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class DotsVLProcessor(Qwen2_5_VLProcessor):
    """Processor for dots.ocr: Qwen2.5-VL processing wired to the
    dots-specific image/video placeholder tokens."""

    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
        # Prefer placeholder tokens declared on the tokenizer; fall back to
        # the dots defaults when the tokenizer does not define them.
        self.image_token = getattr(tokenizer, "image_token", "<|imgpad|>")
        self.video_token = getattr(tokenizer, "video_token", "<|video_pad|>")
        # NOTE(review): these ids are hard-coded here and appear to mirror
        # config.json's image_token_id / video_token_id — keep them in sync.
        self.image_token_id = 151665
        self.video_token_id = 151656
|
| 76 |
+
|
| 77 |
+
# Make the custom classes discoverable through transformers' Auto* factories
# under the "dots_ocr" model type (runs on module import).
AutoProcessor.register("dots_ocr", DotsVLProcessor)
CONFIG_MAPPING.register("dots_ocr", DotsOCRConfig)
|
dots.ocr-1.5 LICENSE AGREEMENT
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dots.ocr LICENSE AGREEMENT
|
| 2 |
+
|
| 3 |
+
Effective Date: [ August 8, 2025]
|
| 4 |
+
|
| 5 |
+
Copyright Holder: [Xingyin Information Technology (Shanghai) Co., Ltd]
|
| 6 |
+
|
| 7 |
+
This License Agreement (“Agreement”) governs Your use, reproduction, modification, and distribution of dots.ocr (the "Model Materials"). This Agreement is designed to maximize the openness and use of the Model Materials while addressing the unique legal, ethical, and technical challenges posed by large language models.
|
| 8 |
+
|
| 9 |
+
WHEREAS, Licensor has developed the dots.ocr document parsing model and intends to distribute the Model Materials under an open‑source framework;
|
| 10 |
+
WHEREAS, traditional open-source licenses (e.g., the MIT License) may not fully address the inherent complexities of document parsing models, namely their multiple components (code, weights, training data), potential ethical risks, data‑governance issues, and intellectual‑property and liability questions regarding AI‑generated content;
|
| 11 |
+
WHEREAS, Licensor seeks to provide a legal framework that ensures maximum access to and use of the Model Materials while clearly defining the rights, obligations, and liabilities of Licensee;
|
| 12 |
+
|
| 13 |
+
THEREFORE, the parties agree that, subject to the MIT License, they shall be bound by the following terms and conditions:
|
| 14 |
+
|
| 15 |
+
1. Definitions and Interpretation
|
| 16 |
+
Purpose: To define key terms used in this Agreement, particularly "Model Materials," ensuring clarity of the license scope beyond traditional software code. To clarify the order of precedence between this Agreement and the MIT License to avoid conflict.
|
| 17 |
+
|
| 18 |
+
1.1 “Licensor” shall mean the entity providing the Model Materials under this Agreement, namely [Xingyin Information Technology (Shanghai) Co., Ltd].
|
| 19 |
+
|
| 20 |
+
1.2 “Licensee” or "You" shall mean any individual or entity exercising permissions granted by this Agreement.
|
| 21 |
+
|
| 22 |
+
1.3 “Model Materials” shall mean all materials provided by Licensor under this Agreement, including but not limited to:
|
| 23 |
+
(a) one or more machine‑learning models, including architecture and trained parameters (i.e., model weights);
|
| 24 |
+
(b) all associated preprocessing, training, inference, and fine‑tuning code;
|
| 25 |
+
(c) training datasets and evaluation scripts (or their detailed descriptions and access mechanisms); and
|
| 26 |
+
(d) any accompanying documentation, metadata, and tools.
|
| 27 |
+
The above Model Materials shall be subject to the content published on the Licensor’s website or GitHub repository at https://github.com/rednote-hilab/dots.ocr.
|
| 28 |
+
|
| 29 |
+
1.4 “Outputs” shall mean any content generated through the use of the Model Materials, such as text, tables, code, layout information, and formulas extracted from documents.
|
| 30 |
+
|
| 31 |
+
1.5 “MIT License” shall mean The MIT Open Source License published by the Massachusetts Institute of Technology.
|
| 32 |
+
|
| 33 |
+
1.6 Priority of Agreement. In the event of any conflict or inconsistency between this Agreement and the MIT License, the terms of the MIT License shall prevail. However, if the terms of the MIT License are ambiguous or silent on a particular matter, the provisions of this Agreement shall apply and supplement the MIT License.
|
| 34 |
+
|
| 35 |
+
2. Grant of Rights and Scope of Use
|
| 36 |
+
|
| 37 |
+
Purpose: To grant broad, permissive rights to the Licensee for the Model Materials—including code, weights, data, and documentation—to ensure maximum openness and flexibility while clarifying the free use of model-generated content. Additionally, it clarifies the feasibility of transitioning from open-source to commercial‑use and the use of OpenAPI interfaces.
|
| 38 |
+
|
| 39 |
+
2.1 Grant of Copyright License. Subject to Licensee's compliance with this Agreement, Licensor hereby grants Licensee a perpetual, worldwide, non‑exclusive, no-charge, royalty‑free copyright license to use (run or test), reproduce, modify, create derivative works of, merge, publish, distribute the Model Materials; sublicense and/or sell copies of the Model Materials or any derivative works thereof; and incorporate the unmodified or modified Model Materials into proprietary products or services, including for commercial purposes, software‑as‑a‑service (SaaS) offerings, or via OpenAPI or other interfaces.
|
| 40 |
+
|
| 41 |
+
2.2 Fundamental Capabilities. The Model Materials only provide the fundamental model’s capabilities. Licensees may develop derivative AI applications or undertake task‑specific training thereon.
|
| 42 |
+
|
| 43 |
+
2.3 From Open Source to Commercial Use. The open-source release does not preclude Licensor’s commercial exploitation of the Model Materials, in whole or in part. Any such commercial use shall, at that time, be subject to license agreements between Licensor and applicable users.
|
| 44 |
+
|
| 45 |
+
2.4 API‑Service Exception. Licensees who access the Model Materials through API calls or provide model services via API interfaces (without directly distributing model weights) shall not be subject to this Agreement unless otherwise expressly agreed. Instead, such use shall be governed by the API terms of use published by Licensor (if any).
|
| 46 |
+
|
| 47 |
+
3. Acceptable Use Policy and Prohibited Uses
|
| 48 |
+
|
| 49 |
+
3.1 Responsible Use. Licensee must use the Model Materials in a responsible, ethical, and lawful manner, in compliance with all applicable laws, regulations, industry standards, and best practices.
|
| 50 |
+
|
| 51 |
+
3.2 Enterprise On‑Premises Deployment. The Licensee may deploy the Model Materials in closed‑source, on‑premises enterprise environments.
|
| 52 |
+
|
| 53 |
+
3.3 Prohibited Uses. Any breach of the prohibitions below will result in the automatic termination of all licenses granted under this Agreement. Licensee agrees not to use the Model Materials or any derivative works thereof, in connection with:
|
| 54 |
+
(a) Identification and Utilization of Illegal/Harmful Content:Includes identifying graphic/text materials used for counterfeiting certificates/invoices, perpetrating fraud, or launching cyberattacks; or processing images containing illegal content such as violence, criminal activities, disinformation, or child exploitation.
|
| 55 |
+
(b) Privacy Infringement and Discriminatory Practices:Extracting personal sensitive information (e.g., ID numbers, medical records, biometric data) or protected characteristics (e.g., race, gender) from images without legal authorization or consent, for purposes of privacy violation, automated discriminatory decision-making, or harassment.
|
| 56 |
+
(c) Copyright Restrictions:Licensees shall not use the tool for unauthorized digitization of publications/document scanning or bulk scraping of content. Any use involving publications or other copyright-protected materials must first obtain relevant permissions.
|
| 57 |
+
|
| 58 |
+
4. Intellectual Property Ownership and Contributions
|
| 59 |
+
|
| 60 |
+
4.1 Licensor's Copyright Reservation. Licensor reserves all right, title, and interest in and to the Model Materials (including the model architecture, parameters, code, and original training data), except as expressly licensed herein. The original copyright of the Model Materials belongs to the Licensor.
|
| 61 |
+
|
| 62 |
+
4.2 Patent License. Subject to the terms and conditions of this Agreement, Licensor hereby grants Licensee a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model Materials, where such license applies only to those patent claims licensable by the Lisensor that are necessarily infringed by its contribution(s).
|
| 63 |
+
If Licensee institutes patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model Materials constitute direct or contributory patent infringement, then any patent licenses granted under this License for the Model Materials shall terminate as of the date such litigation is asserted or filed.
|
| 64 |
+
|
| 65 |
+
4.3 Outputs: The Outputs generated through the use of the Model Materials generally refer to text, tables, layouts, and other content extracted from documents or images. The extracted content itself does not generate new intellectual property rights, and all intellectual property remains with the original authors or copyright holders. The Licensee is responsible for due diligence regarding the legality of the Outputs, particularly where the content extracted by the OCR model may be substantially similar to existing copyrighted works, which could present intellectual property infringement risks. The Licensor assumes no liability for such infringements.
|
| 66 |
+
4.4 Trademarks. Nothing in this License permits Licensee to make use of Licensor’s trademarks, trade names, logos (e.g., “rednote,” “Xiaohongshu,” “dots.ocr”) or to otherwise suggest endorsement or misrepresent the relationship between the parties, unless Licensor’s prior written approval is granted.
|
| 67 |
+
|
| 68 |
+
5. Data Governance, Privacy, and Security
|
| 69 |
+
|
| 70 |
+
5.1 Data Quality and Bias. Licensee shall use training data from lawful sources and is encouraged to conduct due diligence before deploying the Model Materials and to take reasonable steps to mitigate any known biases in its training data or applications.
|
| 71 |
+
|
| 72 |
+
5.2 Privacy Protection.
|
| 73 |
+
(a) Sensitive‑Data Restrictions. It is prohibited to use the Model Materials to process,or extract infer sensitive personal data protected under specific laws (such as GDPR or HIPAA), particularly when dealing with documents containing personally identifiable information (such as ID numbers, health data, financial information, etc.), unless Licensee has obtained all necessary consents, lawful basis, or authorizations, and has implemented adequate anonymization, pseudonymization, or other privacy-enhancing technologies.
|
| 74 |
+
(b) Data Minimization and Purpose Limitation. The Licensee shall follow the principle of data minimization when using the OCR Model, processing only the user data necessary for specific, explicit, and lawful purposes. Specifically, the OCR Model should avoid processing unnecessary sensitive data and ensure compliance with applicable privacy protection laws during data handling.
|
| 75 |
+
(c) Transparency. Licensee shall provide clear and transparent privacy policies and terms of use when processing user data, particularly during document scanning and information extraction.
|
| 76 |
+
|
| 77 |
+
5.3 Security Measures. Licensee shall implement appropriate technical and administrative safeguards to protect the Model Materials and any associated data against unauthorized access, disclosure, alteration, or destruction. Such measures may include, but are not limited to, encryption, access controls, logging, and audit trails.
|
| 78 |
+
|
| 79 |
+
5.4 Further Training. Licensee may only use user‑provided input or Outputs for training, fine-tuning, or improving other AI models if it has obtained the specific and informed consent of data subjects.
|
| 80 |
+
|
| 81 |
+
6. Disclaimer of Warranty and Limitation of Liability
|
| 82 |
+
|
| 83 |
+
6.1 “AS IS” Basis. Unless required by applicable law, the Model Materials are provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. Licensee is solely responsible for determining the appropriateness of using or redistributing the Model Materials and assume any risks associated with the exercise of permissions under this License. Licensor does not provide any warranty of non-infringement but represents that no infringing code has been knowingly included.
|
| 84 |
+
|
| 85 |
+
6.2 Outputs Disclaimer. As a neutral technology, Licensor disclaims all liability for the accuracy, completeness, reliability, safety, legality, or suitability of any Outputs. The Licensee is solely responsible for verifying the accuracy and appropriateness of AI-generated content and shall provide appropriate disclosures when publishing or relying upon such content.
|
| 86 |
+
|
| 87 |
+
6.3 Limitation of Liability and Recourse. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, shall Licensor or contributors be liable for any claims, damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model Materials (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Licensor has been advised of the possibility of such damages. If such losses are incurred, recourse may be sought against the Licensee responsible for causing the loss.
|
| 88 |
+
|
| 89 |
+
6.4 Content‑Filtering Disclaimer. Although the Model Materials may include content‑filtering mechanisms, Licensor makes no warranties of any kind regarding the stability, quality, accuracy, completeness, or any specific outcome of Outputs. Licensee is solely responsible for reviewing, verifying, and performing quality control on Outputs and assumes all associated risks and liabilities.
|
| 90 |
+
|
| 91 |
+
7. Attribution and License Reservation
|
| 92 |
+
|
| 93 |
+
7.1 License. When distributing or redistributing the Model Materials, Licensee must give any other recipients of the Model Materials a copy of this Agreement.
|
| 94 |
+
|
| 95 |
+
7.2 Copyright and Notices. When distributing any part of the Model Materials, Licensee must retain all copyright, patent, trademark, and attribution notices included in the Model Materials.
|
| 96 |
+
|
| 97 |
+
7.3 Attribution. Licensee is encouraged to prominently display the name of Licensor and the Model Materials in any public statements, products, or services that contain the Model Materials (or any derivative works thereof), to promote transparency and community trust. If Licensee distributes modified weights or fine‑tuned models based on the Model Materials, Licensee must prominently display the following statement in the related website or documentation: “Built with dots.ocr.”
|
| 98 |
+
|
| 99 |
+
8. Governing Law and Dispute Resolution
|
| 100 |
+
|
| 101 |
+
8.1 Governing Law. This Agreement shall be governed by and construed in accordance with the laws of the People’s Republic of China, without regard to its conflict of laws principles.
|
| 102 |
+
|
| 103 |
+
8.2 Dispute Resolution. Any dispute claim, or disagreement arising out of or relating to this Agreement shall first be resolved through amicable consultation. If such consultation fails, the dispute shall be submitted to the Hangzhou Arbitration Commission for arbitration. The arbitration shall be conducted in accordance with the laws of China, and the place of arbitration shall be [Hangzhou, China]. The arbitral award shall be final and binding upon both parties.
|
| 104 |
+
|
| 105 |
+
9. Regulatory Compliance Amendments
|
| 106 |
+
In the event that any part of this Agreement becomes invalid or requires adjustment due to changes in applicable laws or regulations, Licensor reserves the right to issue a revised version of this Agreement. Licensee shall migrate to the new version within [e.g., ninety (90)] days of its release; otherwise, all rights granted under this Agreement shall automatically terminate.
|
| 107 |
+
|
| 108 |
+
10. Security Reporting
|
| 109 |
+
Licensee discovering any security vulnerability in the Model Materials may report it to Licensor via: dots-feedback@xiaohongshu.com. Licensee shall not disclose vulnerability details until Licensor issues an official remediation, unless otherwise required by law.
|
generation_config.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"max_length": 32768,
|
| 3 |
+
"eos_token_id": [
|
| 4 |
+
151643,
|
| 5 |
+
151672,
|
| 6 |
+
151673
|
| 7 |
+
]
|
| 8 |
+
}
|
merges.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:599bab54075088774b1733fde865d5bd747cbcc7a547c5bc12610e874e26f5e3
|
| 3 |
+
size 1671839
|
model-00001-of-00002.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8f4bc75340279da003609fe93f2eb02cc1a77087f5dfb6ba46c0980e1b4da81
|
| 3 |
+
size 4998547840
|
model-00002-of-00002.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0101b36fe6620ba135d4bb8efbbf275fc27b4363b10b7f632c058c0955f3dc4d
|
| 3 |
+
size 1079883896
|
model.safetensors.index.json
ADDED
|
@@ -0,0 +1,651 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_parameters": 3039179264,
|
| 4 |
+
"total_size": 6078358528
|
| 5 |
+
},
|
| 6 |
+
"weight_map": {
|
| 7 |
+
"lm_head.weight": "model-00002-of-00002.safetensors",
|
| 8 |
+
"model.embed_tokens.weight": "model-00001-of-00002.safetensors",
|
| 9 |
+
"model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 10 |
+
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 11 |
+
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 12 |
+
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 13 |
+
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 14 |
+
"model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 15 |
+
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 16 |
+
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 17 |
+
"model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 18 |
+
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 19 |
+
"model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 20 |
+
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 21 |
+
"model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 22 |
+
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 23 |
+
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 24 |
+
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 25 |
+
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 26 |
+
"model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 27 |
+
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 28 |
+
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 29 |
+
"model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 30 |
+
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 31 |
+
"model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 32 |
+
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 33 |
+
"model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 34 |
+
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 35 |
+
"model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 36 |
+
"model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 37 |
+
"model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 38 |
+
"model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 39 |
+
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 40 |
+
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 41 |
+
"model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 42 |
+
"model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 43 |
+
"model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 44 |
+
"model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 45 |
+
"model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 46 |
+
"model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 47 |
+
"model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 48 |
+
"model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 49 |
+
"model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 50 |
+
"model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 51 |
+
"model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 52 |
+
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 53 |
+
"model.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 54 |
+
"model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 55 |
+
"model.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 56 |
+
"model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 57 |
+
"model.layers.12.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 58 |
+
"model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 59 |
+
"model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 60 |
+
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 61 |
+
"model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 62 |
+
"model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 63 |
+
"model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 64 |
+
"model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 65 |
+
"model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 66 |
+
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 67 |
+
"model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 68 |
+
"model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 69 |
+
"model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 70 |
+
"model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 71 |
+
"model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 72 |
+
"model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 73 |
+
"model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 74 |
+
"model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 75 |
+
"model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 76 |
+
"model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 77 |
+
"model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 78 |
+
"model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 79 |
+
"model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 80 |
+
"model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 81 |
+
"model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 82 |
+
"model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 83 |
+
"model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 84 |
+
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 85 |
+
"model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 86 |
+
"model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 87 |
+
"model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 88 |
+
"model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 89 |
+
"model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 90 |
+
"model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 91 |
+
"model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 92 |
+
"model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 93 |
+
"model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 94 |
+
"model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 95 |
+
"model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 96 |
+
"model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 97 |
+
"model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 98 |
+
"model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 99 |
+
"model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 100 |
+
"model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 101 |
+
"model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 102 |
+
"model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 103 |
+
"model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 104 |
+
"model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 105 |
+
"model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 106 |
+
"model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 107 |
+
"model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 108 |
+
"model.layers.16.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 109 |
+
"model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 110 |
+
"model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 111 |
+
"model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 112 |
+
"model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 113 |
+
"model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 114 |
+
"model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 115 |
+
"model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 116 |
+
"model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 117 |
+
"model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 118 |
+
"model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 119 |
+
"model.layers.17.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 120 |
+
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 121 |
+
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 122 |
+
"model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 123 |
+
"model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 124 |
+
"model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 125 |
+
"model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 126 |
+
"model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 127 |
+
"model.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 128 |
+
"model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 129 |
+
"model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 130 |
+
"model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 131 |
+
"model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 132 |
+
"model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 133 |
+
"model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 134 |
+
"model.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 135 |
+
"model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 136 |
+
"model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 137 |
+
"model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 138 |
+
"model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 139 |
+
"model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 140 |
+
"model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 141 |
+
"model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 142 |
+
"model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 143 |
+
"model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 144 |
+
"model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 145 |
+
"model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 146 |
+
"model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 147 |
+
"model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 148 |
+
"model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 149 |
+
"model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 150 |
+
"model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 151 |
+
"model.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 152 |
+
"model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 153 |
+
"model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 154 |
+
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 155 |
+
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 156 |
+
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 157 |
+
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 158 |
+
"model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 159 |
+
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 160 |
+
"model.layers.2.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 161 |
+
"model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 162 |
+
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 163 |
+
"model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 164 |
+
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 165 |
+
"model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 166 |
+
"model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 167 |
+
"model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 168 |
+
"model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 169 |
+
"model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 170 |
+
"model.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 171 |
+
"model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 172 |
+
"model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 173 |
+
"model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 174 |
+
"model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 175 |
+
"model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 176 |
+
"model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 177 |
+
"model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 178 |
+
"model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 179 |
+
"model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 180 |
+
"model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 181 |
+
"model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 182 |
+
"model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 183 |
+
"model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 184 |
+
"model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 185 |
+
"model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 186 |
+
"model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 187 |
+
"model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 188 |
+
"model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 189 |
+
"model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 190 |
+
"model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 191 |
+
"model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 192 |
+
"model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 193 |
+
"model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 194 |
+
"model.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 195 |
+
"model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 196 |
+
"model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 197 |
+
"model.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 198 |
+
"model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 199 |
+
"model.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 200 |
+
"model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 201 |
+
"model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 202 |
+
"model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 203 |
+
"model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 204 |
+
"model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 205 |
+
"model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 206 |
+
"model.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 207 |
+
"model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 208 |
+
"model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 209 |
+
"model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 210 |
+
"model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 211 |
+
"model.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 212 |
+
"model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 213 |
+
"model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 214 |
+
"model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 215 |
+
"model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 216 |
+
"model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 217 |
+
"model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 218 |
+
"model.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 219 |
+
"model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 220 |
+
"model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 221 |
+
"model.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 222 |
+
"model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 223 |
+
"model.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 224 |
+
"model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 225 |
+
"model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 226 |
+
"model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 227 |
+
"model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 228 |
+
"model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 229 |
+
"model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 230 |
+
"model.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 231 |
+
"model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 232 |
+
"model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 233 |
+
"model.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 234 |
+
"model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 235 |
+
"model.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 236 |
+
"model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 237 |
+
"model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 238 |
+
"model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 239 |
+
"model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 240 |
+
"model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 241 |
+
"model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 242 |
+
"model.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 243 |
+
"model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 244 |
+
"model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 245 |
+
"model.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 246 |
+
"model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 247 |
+
"model.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 248 |
+
"model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 249 |
+
"model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 250 |
+
"model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 251 |
+
"model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 252 |
+
"model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 253 |
+
"model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 254 |
+
"model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 255 |
+
"model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 256 |
+
"model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 257 |
+
"model.layers.27.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 258 |
+
"model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 259 |
+
"model.layers.27.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 260 |
+
"model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 261 |
+
"model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 262 |
+
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 263 |
+
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 264 |
+
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 265 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 266 |
+
"model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 267 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 268 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 269 |
+
"model.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
|
| 270 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 271 |
+
"model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 272 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 273 |
+
"model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 274 |
+
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 275 |
+
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 276 |
+
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 277 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 278 |
+
"model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 279 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 280 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 281 |
+
"model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 282 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 283 |
+
"model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 284 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 285 |
+
"model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 286 |
+
"model.layers.5.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 287 |
+
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 288 |
+
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 289 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 290 |
+
"model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 291 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 292 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 293 |
+
"model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 294 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 295 |
+
"model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 296 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 297 |
+
"model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 298 |
+
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 299 |
+
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 300 |
+
"model.layers.6.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 301 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 302 |
+
"model.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
|
| 303 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 304 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 305 |
+
"model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 306 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 307 |
+
"model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 308 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 309 |
+
"model.layers.7.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 310 |
+
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 311 |
+
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 312 |
+
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 313 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 314 |
+
"model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 315 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 316 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 317 |
+
"model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 318 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 319 |
+
"model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 320 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 321 |
+
"model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 322 |
+
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 323 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 324 |
+
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 325 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 326 |
+
"model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 327 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 328 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 329 |
+
"model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 330 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 331 |
+
"model.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
|
| 332 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 333 |
+
"model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 334 |
+
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 335 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 336 |
+
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 337 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 338 |
+
"model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 339 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 340 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 341 |
+
"model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 342 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 343 |
+
"model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 344 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 345 |
+
"model.norm.weight": "model-00001-of-00002.safetensors",
|
| 346 |
+
"vision_tower.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 347 |
+
"vision_tower.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 348 |
+
"vision_tower.blocks.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 349 |
+
"vision_tower.blocks.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 350 |
+
"vision_tower.blocks.0.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 351 |
+
"vision_tower.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
|
| 352 |
+
"vision_tower.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
|
| 353 |
+
"vision_tower.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 354 |
+
"vision_tower.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 355 |
+
"vision_tower.blocks.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 356 |
+
"vision_tower.blocks.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 357 |
+
"vision_tower.blocks.1.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 358 |
+
"vision_tower.blocks.1.norm1.weight": "model-00002-of-00002.safetensors",
|
| 359 |
+
"vision_tower.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
|
| 360 |
+
"vision_tower.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 361 |
+
"vision_tower.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 362 |
+
"vision_tower.blocks.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 363 |
+
"vision_tower.blocks.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 364 |
+
"vision_tower.blocks.10.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 365 |
+
"vision_tower.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
|
| 366 |
+
"vision_tower.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
|
| 367 |
+
"vision_tower.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 368 |
+
"vision_tower.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 369 |
+
"vision_tower.blocks.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 370 |
+
"vision_tower.blocks.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 371 |
+
"vision_tower.blocks.11.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 372 |
+
"vision_tower.blocks.11.norm1.weight": "model-00002-of-00002.safetensors",
|
| 373 |
+
"vision_tower.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
|
| 374 |
+
"vision_tower.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 375 |
+
"vision_tower.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 376 |
+
"vision_tower.blocks.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 377 |
+
"vision_tower.blocks.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 378 |
+
"vision_tower.blocks.12.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 379 |
+
"vision_tower.blocks.12.norm1.weight": "model-00002-of-00002.safetensors",
|
| 380 |
+
"vision_tower.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
|
| 381 |
+
"vision_tower.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 382 |
+
"vision_tower.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 383 |
+
"vision_tower.blocks.13.mlp.fc1.weight": "model-00002-of-00002.safetensors",
|
| 384 |
+
"vision_tower.blocks.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 385 |
+
"vision_tower.blocks.13.mlp.fc3.weight": "model-00002-of-00002.safetensors",
|
| 386 |
+
"vision_tower.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
|
| 387 |
+
"vision_tower.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
|
| 388 |
+
"vision_tower.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 389 |
+
"vision_tower.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 390 |
+
"vision_tower.blocks.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 391 |
+
"vision_tower.blocks.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 392 |
+
"vision_tower.blocks.14.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 393 |
+
"vision_tower.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
|
| 394 |
+
"vision_tower.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
|
| 395 |
+
"vision_tower.blocks.15.attn.proj.weight": "model-00002-of-00002.safetensors",
|
| 396 |
+
"vision_tower.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 397 |
+
"vision_tower.blocks.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 398 |
+
"vision_tower.blocks.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 399 |
+
"vision_tower.blocks.15.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 400 |
+
"vision_tower.blocks.15.norm1.weight": "model-00002-of-00002.safetensors",
|
| 401 |
+
"vision_tower.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
|
| 402 |
+
"vision_tower.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 403 |
+
"vision_tower.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 404 |
+
"vision_tower.blocks.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 405 |
+
"vision_tower.blocks.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 406 |
+
"vision_tower.blocks.16.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 407 |
+
"vision_tower.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
|
| 408 |
+
"vision_tower.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
|
| 409 |
+
"vision_tower.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 410 |
+
"vision_tower.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 411 |
+
"vision_tower.blocks.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 412 |
+
"vision_tower.blocks.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 413 |
+
"vision_tower.blocks.17.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 414 |
+
"vision_tower.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
|
| 415 |
+
"vision_tower.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
|
| 416 |
+
"vision_tower.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 417 |
+
"vision_tower.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 418 |
+
"vision_tower.blocks.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 419 |
+
"vision_tower.blocks.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 420 |
+
"vision_tower.blocks.18.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 421 |
+
"vision_tower.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
|
| 422 |
+
"vision_tower.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
|
| 423 |
+
"vision_tower.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 424 |
+
"vision_tower.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 425 |
+
"vision_tower.blocks.19.mlp.fc1.weight": "model-00002-of-00002.safetensors",
|
| 426 |
+
"vision_tower.blocks.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 427 |
+
"vision_tower.blocks.19.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 428 |
+
"vision_tower.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
|
| 429 |
+
"vision_tower.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
|
| 430 |
+
"vision_tower.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 431 |
+
"vision_tower.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 432 |
+
"vision_tower.blocks.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 433 |
+
"vision_tower.blocks.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 434 |
+
"vision_tower.blocks.2.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 435 |
+
"vision_tower.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
|
| 436 |
+
"vision_tower.blocks.2.norm2.weight": "model-00002-of-00002.safetensors",
|
| 437 |
+
"vision_tower.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 438 |
+
"vision_tower.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 439 |
+
"vision_tower.blocks.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 440 |
+
"vision_tower.blocks.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 441 |
+
"vision_tower.blocks.20.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 442 |
+
"vision_tower.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
|
| 443 |
+
"vision_tower.blocks.20.norm2.weight": "model-00002-of-00002.safetensors",
|
| 444 |
+
"vision_tower.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 445 |
+
"vision_tower.blocks.21.attn.qkv.weight": "model-00002-of-00002.safetensors",
|
| 446 |
+
"vision_tower.blocks.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 447 |
+
"vision_tower.blocks.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 448 |
+
"vision_tower.blocks.21.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 449 |
+
"vision_tower.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
|
| 450 |
+
"vision_tower.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
|
| 451 |
+
"vision_tower.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 452 |
+
"vision_tower.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 453 |
+
"vision_tower.blocks.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 454 |
+
"vision_tower.blocks.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 455 |
+
"vision_tower.blocks.22.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 456 |
+
"vision_tower.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
|
| 457 |
+
"vision_tower.blocks.22.norm2.weight": "model-00002-of-00002.safetensors",
|
| 458 |
+
"vision_tower.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 459 |
+
"vision_tower.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 460 |
+
"vision_tower.blocks.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 461 |
+
"vision_tower.blocks.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 462 |
+
"vision_tower.blocks.23.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 463 |
+
"vision_tower.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
|
| 464 |
+
"vision_tower.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
|
| 465 |
+
"vision_tower.blocks.24.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 466 |
+
"vision_tower.blocks.24.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 467 |
+
"vision_tower.blocks.24.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 468 |
+
"vision_tower.blocks.24.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 469 |
+
"vision_tower.blocks.24.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 470 |
+
"vision_tower.blocks.24.norm1.weight": "model-00001-of-00002.safetensors",
|
| 471 |
+
"vision_tower.blocks.24.norm2.weight": "model-00001-of-00002.safetensors",
|
| 472 |
+
"vision_tower.blocks.25.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 473 |
+
"vision_tower.blocks.25.attn.qkv.weight": "model-00002-of-00002.safetensors",
|
| 474 |
+
"vision_tower.blocks.25.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 475 |
+
"vision_tower.blocks.25.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 476 |
+
"vision_tower.blocks.25.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 477 |
+
"vision_tower.blocks.25.norm1.weight": "model-00001-of-00002.safetensors",
|
| 478 |
+
"vision_tower.blocks.25.norm2.weight": "model-00002-of-00002.safetensors",
|
| 479 |
+
"vision_tower.blocks.26.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 480 |
+
"vision_tower.blocks.26.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 481 |
+
"vision_tower.blocks.26.mlp.fc1.weight": "model-00002-of-00002.safetensors",
|
| 482 |
+
"vision_tower.blocks.26.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 483 |
+
"vision_tower.blocks.26.mlp.fc3.weight": "model-00002-of-00002.safetensors",
|
| 484 |
+
"vision_tower.blocks.26.norm1.weight": "model-00002-of-00002.safetensors",
|
| 485 |
+
"vision_tower.blocks.26.norm2.weight": "model-00001-of-00002.safetensors",
|
| 486 |
+
"vision_tower.blocks.27.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 487 |
+
"vision_tower.blocks.27.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 488 |
+
"vision_tower.blocks.27.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 489 |
+
"vision_tower.blocks.27.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 490 |
+
"vision_tower.blocks.27.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 491 |
+
"vision_tower.blocks.27.norm1.weight": "model-00001-of-00002.safetensors",
|
| 492 |
+
"vision_tower.blocks.27.norm2.weight": "model-00001-of-00002.safetensors",
|
| 493 |
+
"vision_tower.blocks.28.attn.proj.weight": "model-00002-of-00002.safetensors",
|
| 494 |
+
"vision_tower.blocks.28.attn.qkv.weight": "model-00002-of-00002.safetensors",
|
| 495 |
+
"vision_tower.blocks.28.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 496 |
+
"vision_tower.blocks.28.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 497 |
+
"vision_tower.blocks.28.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 498 |
+
"vision_tower.blocks.28.norm1.weight": "model-00002-of-00002.safetensors",
|
| 499 |
+
"vision_tower.blocks.28.norm2.weight": "model-00002-of-00002.safetensors",
|
| 500 |
+
"vision_tower.blocks.29.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 501 |
+
"vision_tower.blocks.29.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 502 |
+
"vision_tower.blocks.29.mlp.fc1.weight": "model-00002-of-00002.safetensors",
|
| 503 |
+
"vision_tower.blocks.29.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 504 |
+
"vision_tower.blocks.29.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 505 |
+
"vision_tower.blocks.29.norm1.weight": "model-00001-of-00002.safetensors",
|
| 506 |
+
"vision_tower.blocks.29.norm2.weight": "model-00002-of-00002.safetensors",
|
| 507 |
+
"vision_tower.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 508 |
+
"vision_tower.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 509 |
+
"vision_tower.blocks.3.mlp.fc1.weight": "model-00002-of-00002.safetensors",
|
| 510 |
+
"vision_tower.blocks.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 511 |
+
"vision_tower.blocks.3.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 512 |
+
"vision_tower.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
|
| 513 |
+
"vision_tower.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
|
| 514 |
+
"vision_tower.blocks.30.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 515 |
+
"vision_tower.blocks.30.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 516 |
+
"vision_tower.blocks.30.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 517 |
+
"vision_tower.blocks.30.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 518 |
+
"vision_tower.blocks.30.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 519 |
+
"vision_tower.blocks.30.norm1.weight": "model-00001-of-00002.safetensors",
|
| 520 |
+
"vision_tower.blocks.30.norm2.weight": "model-00001-of-00002.safetensors",
|
| 521 |
+
"vision_tower.blocks.31.attn.proj.weight": "model-00002-of-00002.safetensors",
|
| 522 |
+
"vision_tower.blocks.31.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 523 |
+
"vision_tower.blocks.31.mlp.fc1.weight": "model-00002-of-00002.safetensors",
|
| 524 |
+
"vision_tower.blocks.31.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 525 |
+
"vision_tower.blocks.31.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 526 |
+
"vision_tower.blocks.31.norm1.weight": "model-00001-of-00002.safetensors",
|
| 527 |
+
"vision_tower.blocks.31.norm2.weight": "model-00001-of-00002.safetensors",
|
| 528 |
+
"vision_tower.blocks.32.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 529 |
+
"vision_tower.blocks.32.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 530 |
+
"vision_tower.blocks.32.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 531 |
+
"vision_tower.blocks.32.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 532 |
+
"vision_tower.blocks.32.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 533 |
+
"vision_tower.blocks.32.norm1.weight": "model-00001-of-00002.safetensors",
|
| 534 |
+
"vision_tower.blocks.32.norm2.weight": "model-00001-of-00002.safetensors",
|
| 535 |
+
"vision_tower.blocks.33.attn.proj.weight": "model-00002-of-00002.safetensors",
|
| 536 |
+
"vision_tower.blocks.33.attn.qkv.weight": "model-00002-of-00002.safetensors",
|
| 537 |
+
"vision_tower.blocks.33.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 538 |
+
"vision_tower.blocks.33.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 539 |
+
"vision_tower.blocks.33.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 540 |
+
"vision_tower.blocks.33.norm1.weight": "model-00001-of-00002.safetensors",
|
| 541 |
+
"vision_tower.blocks.33.norm2.weight": "model-00001-of-00002.safetensors",
|
| 542 |
+
"vision_tower.blocks.34.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 543 |
+
"vision_tower.blocks.34.attn.qkv.weight": "model-00002-of-00002.safetensors",
|
| 544 |
+
"vision_tower.blocks.34.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 545 |
+
"vision_tower.blocks.34.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 546 |
+
"vision_tower.blocks.34.mlp.fc3.weight": "model-00002-of-00002.safetensors",
|
| 547 |
+
"vision_tower.blocks.34.norm1.weight": "model-00002-of-00002.safetensors",
|
| 548 |
+
"vision_tower.blocks.34.norm2.weight": "model-00001-of-00002.safetensors",
|
| 549 |
+
"vision_tower.blocks.35.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 550 |
+
"vision_tower.blocks.35.attn.qkv.weight": "model-00002-of-00002.safetensors",
|
| 551 |
+
"vision_tower.blocks.35.mlp.fc1.weight": "model-00002-of-00002.safetensors",
|
| 552 |
+
"vision_tower.blocks.35.mlp.fc2.weight": "model-00002-of-00002.safetensors",
|
| 553 |
+
"vision_tower.blocks.35.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 554 |
+
"vision_tower.blocks.35.norm1.weight": "model-00001-of-00002.safetensors",
|
| 555 |
+
"vision_tower.blocks.35.norm2.weight": "model-00001-of-00002.safetensors",
|
| 556 |
+
"vision_tower.blocks.36.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 557 |
+
"vision_tower.blocks.36.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 558 |
+
"vision_tower.blocks.36.mlp.fc1.weight": "model-00002-of-00002.safetensors",
|
| 559 |
+
"vision_tower.blocks.36.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 560 |
+
"vision_tower.blocks.36.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 561 |
+
"vision_tower.blocks.36.norm1.weight": "model-00001-of-00002.safetensors",
|
| 562 |
+
"vision_tower.blocks.36.norm2.weight": "model-00001-of-00002.safetensors",
|
| 563 |
+
"vision_tower.blocks.37.attn.proj.weight": "model-00002-of-00002.safetensors",
|
| 564 |
+
"vision_tower.blocks.37.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 565 |
+
"vision_tower.blocks.37.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 566 |
+
"vision_tower.blocks.37.mlp.fc2.weight": "model-00002-of-00002.safetensors",
|
| 567 |
+
"vision_tower.blocks.37.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 568 |
+
"vision_tower.blocks.37.norm1.weight": "model-00001-of-00002.safetensors",
|
| 569 |
+
"vision_tower.blocks.37.norm2.weight": "model-00001-of-00002.safetensors",
|
| 570 |
+
"vision_tower.blocks.38.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 571 |
+
"vision_tower.blocks.38.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 572 |
+
"vision_tower.blocks.38.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 573 |
+
"vision_tower.blocks.38.mlp.fc2.weight": "model-00002-of-00002.safetensors",
|
| 574 |
+
"vision_tower.blocks.38.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 575 |
+
"vision_tower.blocks.38.norm1.weight": "model-00001-of-00002.safetensors",
|
| 576 |
+
"vision_tower.blocks.38.norm2.weight": "model-00002-of-00002.safetensors",
|
| 577 |
+
"vision_tower.blocks.39.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 578 |
+
"vision_tower.blocks.39.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 579 |
+
"vision_tower.blocks.39.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 580 |
+
"vision_tower.blocks.39.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 581 |
+
"vision_tower.blocks.39.mlp.fc3.weight": "model-00002-of-00002.safetensors",
|
| 582 |
+
"vision_tower.blocks.39.norm1.weight": "model-00002-of-00002.safetensors",
|
| 583 |
+
"vision_tower.blocks.39.norm2.weight": "model-00001-of-00002.safetensors",
|
| 584 |
+
"vision_tower.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 585 |
+
"vision_tower.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 586 |
+
"vision_tower.blocks.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 587 |
+
"vision_tower.blocks.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 588 |
+
"vision_tower.blocks.4.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 589 |
+
"vision_tower.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
|
| 590 |
+
"vision_tower.blocks.4.norm2.weight": "model-00002-of-00002.safetensors",
|
| 591 |
+
"vision_tower.blocks.40.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 592 |
+
"vision_tower.blocks.40.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 593 |
+
"vision_tower.blocks.40.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 594 |
+
"vision_tower.blocks.40.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 595 |
+
"vision_tower.blocks.40.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 596 |
+
"vision_tower.blocks.40.norm1.weight": "model-00001-of-00002.safetensors",
|
| 597 |
+
"vision_tower.blocks.40.norm2.weight": "model-00001-of-00002.safetensors",
|
| 598 |
+
"vision_tower.blocks.41.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 599 |
+
"vision_tower.blocks.41.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 600 |
+
"vision_tower.blocks.41.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 601 |
+
"vision_tower.blocks.41.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 602 |
+
"vision_tower.blocks.41.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 603 |
+
"vision_tower.blocks.41.norm1.weight": "model-00001-of-00002.safetensors",
|
| 604 |
+
"vision_tower.blocks.41.norm2.weight": "model-00002-of-00002.safetensors",
|
| 605 |
+
"vision_tower.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 606 |
+
"vision_tower.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 607 |
+
"vision_tower.blocks.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 608 |
+
"vision_tower.blocks.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 609 |
+
"vision_tower.blocks.5.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 610 |
+
"vision_tower.blocks.5.norm1.weight": "model-00002-of-00002.safetensors",
|
| 611 |
+
"vision_tower.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
|
| 612 |
+
"vision_tower.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 613 |
+
"vision_tower.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 614 |
+
"vision_tower.blocks.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 615 |
+
"vision_tower.blocks.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 616 |
+
"vision_tower.blocks.6.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 617 |
+
"vision_tower.blocks.6.norm1.weight": "model-00002-of-00002.safetensors",
|
| 618 |
+
"vision_tower.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
|
| 619 |
+
"vision_tower.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 620 |
+
"vision_tower.blocks.7.attn.qkv.weight": "model-00002-of-00002.safetensors",
|
| 621 |
+
"vision_tower.blocks.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 622 |
+
"vision_tower.blocks.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 623 |
+
"vision_tower.blocks.7.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 624 |
+
"vision_tower.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
|
| 625 |
+
"vision_tower.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
|
| 626 |
+
"vision_tower.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 627 |
+
"vision_tower.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 628 |
+
"vision_tower.blocks.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 629 |
+
"vision_tower.blocks.8.mlp.fc2.weight": "model-00002-of-00002.safetensors",
|
| 630 |
+
"vision_tower.blocks.8.mlp.fc3.weight": "model-00002-of-00002.safetensors",
|
| 631 |
+
"vision_tower.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
|
| 632 |
+
"vision_tower.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
|
| 633 |
+
"vision_tower.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
|
| 634 |
+
"vision_tower.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
|
| 635 |
+
"vision_tower.blocks.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 636 |
+
"vision_tower.blocks.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 637 |
+
"vision_tower.blocks.9.mlp.fc3.weight": "model-00001-of-00002.safetensors",
|
| 638 |
+
"vision_tower.blocks.9.norm1.weight": "model-00002-of-00002.safetensors",
|
| 639 |
+
"vision_tower.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
|
| 640 |
+
"vision_tower.merger.ln_q.bias": "model-00001-of-00002.safetensors",
|
| 641 |
+
"vision_tower.merger.ln_q.weight": "model-00001-of-00002.safetensors",
|
| 642 |
+
"vision_tower.merger.mlp.0.bias": "model-00001-of-00002.safetensors",
|
| 643 |
+
"vision_tower.merger.mlp.0.weight": "model-00001-of-00002.safetensors",
|
| 644 |
+
"vision_tower.merger.mlp.2.bias": "model-00002-of-00002.safetensors",
|
| 645 |
+
"vision_tower.merger.mlp.2.weight": "model-00001-of-00002.safetensors",
|
| 646 |
+
"vision_tower.patch_embed.patchifier.norm.weight": "model-00001-of-00002.safetensors",
|
| 647 |
+
"vision_tower.patch_embed.patchifier.proj.bias": "model-00002-of-00002.safetensors",
|
| 648 |
+
"vision_tower.patch_embed.patchifier.proj.weight": "model-00001-of-00002.safetensors",
|
| 649 |
+
"vision_tower.post_trunk_norm.weight": "model-00001-of-00002.safetensors"
|
| 650 |
+
}
|
| 651 |
+
}
|
modeling_dots_ocr.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Optional, Tuple, Union
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from transformers.modeling_outputs import CausalLMOutputWithPast
|
| 5 |
+
from transformers.models.qwen2 import Qwen2ForCausalLM
|
| 6 |
+
|
| 7 |
+
from .configuration_dots import DotsVisionConfig, DotsOCRConfig
|
| 8 |
+
from .modeling_dots_vision import DotsVisionTransformer
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
DOTS_VLM_MAX_IMAGES = 200
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class DotsOCRForCausalLM(Qwen2ForCausalLM):
|
| 15 |
+
config_class = DotsOCRConfig
|
| 16 |
+
|
| 17 |
+
def __init__(self, config: DotsOCRConfig):
    """Build the OCR model: a Qwen2 causal-LM backbone plus a vision tower.

    Checkpoints may serialize ``vision_config`` as a plain dict; it is
    normalized to a :class:`DotsVisionConfig` in place so downstream code
    can rely on attribute access.
    """
    super().__init__(config)

    # Normalize the vision config before constructing the tower.
    vision_cfg = self.config.vision_config
    if isinstance(vision_cfg, dict):
        vision_cfg = DotsVisionConfig(**vision_cfg)
        self.config.vision_config = vision_cfg

    self.vision_tower = DotsVisionTransformer(vision_cfg)
|
| 27 |
+
|
| 28 |
+
def prepare_inputs_embeds(
|
| 29 |
+
self,
|
| 30 |
+
input_ids: torch.LongTensor,
|
| 31 |
+
pixel_values: Optional[torch.FloatTensor] = None,
|
| 32 |
+
grid_thw: Optional[torch.FloatTensor] = None,
|
| 33 |
+
img_mask: Optional[torch.BoolTensor] = None,
|
| 34 |
+
) -> torch.Tensor:
|
| 35 |
+
inputs_embeds = self.get_input_embeddings()(input_ids)
|
| 36 |
+
|
| 37 |
+
if pixel_values is not None:
|
| 38 |
+
assert img_mask is not None
|
| 39 |
+
if grid_thw.shape[0] > DOTS_VLM_MAX_IMAGES:
|
| 40 |
+
print(
|
| 41 |
+
f"Num image exceeded: {grid_thw.shape[0]} > {DOTS_VLM_MAX_IMAGES}, which may cause FSDP hang"
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
vision_embeddings = self.vision_tower(pixel_values, grid_thw)
|
| 45 |
+
|
| 46 |
+
true_indices = torch.nonzero(img_mask).squeeze()
|
| 47 |
+
if len(true_indices) > vision_embeddings.size(0):
|
| 48 |
+
print(
|
| 49 |
+
f"img_mask sum > VE and will be truncated, mask.sum()={len(true_indices)} {vision_embeddings.size(0)=}"
|
| 50 |
+
)
|
| 51 |
+
true_indices = true_indices[: vision_embeddings.size(0)]
|
| 52 |
+
new_img_mask = torch.zeros_like(img_mask, device=img_mask.device)
|
| 53 |
+
new_img_mask[true_indices[:, 0], true_indices[:, 1]] = True
|
| 54 |
+
else:
|
| 55 |
+
new_img_mask = img_mask
|
| 56 |
+
|
| 57 |
+
assert (
|
| 58 |
+
vision_embeddings.size(0) == new_img_mask.sum()
|
| 59 |
+
), f"{vision_embeddings.size(0)=}, {new_img_mask.sum()=}"
|
| 60 |
+
|
| 61 |
+
inputs_embeds = inputs_embeds.masked_scatter(
|
| 62 |
+
new_img_mask.to(inputs_embeds.device).unsqueeze(-1).expand_as(inputs_embeds),
|
| 63 |
+
vision_embeddings.to(inputs_embeds.device).type(inputs_embeds.dtype),
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
return inputs_embeds
|
| 67 |
+
|
| 68 |
+
def forward(
|
| 69 |
+
self,
|
| 70 |
+
input_ids: torch.LongTensor,
|
| 71 |
+
pixel_values: Optional[torch.FloatTensor] = None,
|
| 72 |
+
image_grid_thw: Optional[torch.FloatTensor] = None,
|
| 73 |
+
inputs_embeds: Optional[torch.Tensor] = None,
|
| 74 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 75 |
+
position_ids: Optional[torch.LongTensor] = None,
|
| 76 |
+
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
| 77 |
+
labels: Optional[torch.LongTensor] = None,
|
| 78 |
+
output_attentions: Optional[bool] = None,
|
| 79 |
+
output_hidden_states: Optional[bool] = None,
|
| 80 |
+
return_dict: Optional[bool] = None,
|
| 81 |
+
use_cache: Optional[bool] = None,
|
| 82 |
+
logits_to_keep: int = 0,
|
| 83 |
+
**loss_kwargs,
|
| 84 |
+
) -> Union[Tuple, CausalLMOutputWithPast]:
|
| 85 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 86 |
+
assert len(input_ids) >= 1, f"empty input_ids {input_ids.shape=} will cause gradnorm nan"
|
| 87 |
+
if inputs_embeds is None:
|
| 88 |
+
img_mask = input_ids == self.config.image_token_id
|
| 89 |
+
inputs_embeds = self.prepare_inputs_embeds(input_ids, pixel_values, image_grid_thw, img_mask)
|
| 90 |
+
|
| 91 |
+
outputs = super().forward(
|
| 92 |
+
inputs_embeds=inputs_embeds,
|
| 93 |
+
attention_mask=attention_mask,
|
| 94 |
+
position_ids=position_ids,
|
| 95 |
+
past_key_values=past_key_values,
|
| 96 |
+
labels=labels,
|
| 97 |
+
use_cache=use_cache if use_cache is not None else self.config.use_cache,
|
| 98 |
+
output_attentions=output_attentions,
|
| 99 |
+
output_hidden_states=output_hidden_states,
|
| 100 |
+
# return_dict=return_dict,
|
| 101 |
+
logits_to_keep=logits_to_keep,
|
| 102 |
+
**loss_kwargs,
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
return outputs
|
| 106 |
+
|
| 107 |
+
def prepare_inputs_for_generation(
|
| 108 |
+
self,
|
| 109 |
+
input_ids,
|
| 110 |
+
past_key_values=None,
|
| 111 |
+
inputs_embeds=None,
|
| 112 |
+
pixel_values=None,
|
| 113 |
+
attention_mask=None,
|
| 114 |
+
cache_position=None,
|
| 115 |
+
num_logits_to_keep=None,
|
| 116 |
+
**kwargs,
|
| 117 |
+
):
|
| 118 |
+
model_inputs = super().prepare_inputs_for_generation(
|
| 119 |
+
input_ids,
|
| 120 |
+
past_key_values=past_key_values,
|
| 121 |
+
inputs_embeds=inputs_embeds,
|
| 122 |
+
attention_mask=attention_mask,
|
| 123 |
+
cache_position=cache_position,
|
| 124 |
+
num_logits_to_keep=num_logits_to_keep,
|
| 125 |
+
**kwargs,
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
if cache_position[0] == 0:
|
| 129 |
+
model_inputs["pixel_values"] = pixel_values
|
| 130 |
+
|
| 131 |
+
return model_inputs
|
modeling_dots_vision.py
ADDED
|
@@ -0,0 +1,404 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
import torch.utils.checkpoint
|
| 7 |
+
from flash_attn import flash_attn_varlen_func
|
| 8 |
+
from torch.nn import LayerNorm
|
| 9 |
+
from transformers.modeling_utils import PreTrainedModel
|
| 10 |
+
from .configuration_dots import DotsVisionConfig
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def rotate_half(x):
    """Rotate the last dimension by swapping its halves and negating the second.

    Given x = [a, b] along the final axis, returns [-b, a].
    """
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat((-second, first), dim=-1)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
    """Apply rotary position embeddings to a vision q/k tensor.

    The rotation is computed in float32 for numerical stability and the
    result is cast back to the input dtype.
    """
    input_dtype = tensor.dtype
    x = tensor.float()

    # Duplicate each frequency so cos/sin span the full head dimension, then
    # add singleton batch/head axes for broadcasting: (1, seq, 1, dim).
    cos = freqs.cos().unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
    sin = freqs.sin().unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()

    # Inlined rotate_half: [a, b] -> [-b, a] along the last axis.
    half = x.shape[-1] // 2
    rotated = torch.cat((-x[..., half:], x[..., :half]), dim=-1)

    return (x * cos + rotated * sin).to(input_dtype)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class VisionRotaryEmbedding(nn.Module):
    """Produces rotary-embedding frequencies for a range of positions.

    Holds inverse frequencies 1 / theta^(2i/dim) as a non-persistent buffer
    and returns the outer product positions x inv_freq on demand.
    """

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        exponents = torch.arange(0, dim, 2, dtype=torch.float) / dim
        self.register_buffer("inv_freq", 1.0 / torch.pow(theta, exponents), persistent=False)

    def forward(self, seqlen: int) -> torch.Tensor:
        # (seqlen, dim // 2) frequency table for positions 0..seqlen-1.
        positions = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        return torch.outer(positions, self.inv_freq)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class PatchMerger(nn.Module):
    """Merges spatially adjacent patch embeddings and projects them to `dim`.

    Groups of spatial_merge_size**2 context vectors are flattened together
    and mapped through a two-layer GELU MLP. An optional pre-normalization
    ("layernorm" or "rmsnorm") is applied before merging.
    """

    def __init__(
        self,
        dim: int,
        context_dim: int,
        spatial_merge_size: int = 2,
        pre_norm="layernorm",
        init_merger_std=None,
    ) -> None:
        super().__init__()
        self.hidden_size = context_dim * (spatial_merge_size**2)
        self.pre_norm = pre_norm
        if self.pre_norm == "layernorm":
            self.ln_q = LayerNorm(context_dim, eps=1e-6)
        elif self.pre_norm == "rmsnorm":
            self.ln_q = RMSNorm(context_dim, eps=1e-6)
        else:
            print("no norm in patch merger")

        self.mlp = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.GELU(),
            nn.Linear(self.hidden_size, dim),
        )

        # Optional custom initialization for both linear layers.
        if init_merger_std is not None:
            for linear in (self.mlp[0], self.mlp[2]):
                nn.init.normal_(linear.weight, mean=0.0, std=init_merger_std)
                nn.init.zeros_(linear.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # NOTE: any non-empty pre_norm string is truthy here, matching the
        # original truthiness check.
        normed = self.ln_q(x) if self.pre_norm else x
        return self.mlp(normed.view(-1, self.hidden_size))
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class VisionAttention(nn.Module):
    """Eager multi-head self-attention over a packed sequence of patches.

    `cu_seqlens` gives cumulative per-image boundaries; an additive mask
    restricts attention to each image's own block of the packed sequence.
    """

    def __init__(self, config, dim: int, num_heads: int = 16, bias=True) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=bias)
        self.proj = nn.Linear(dim, dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: torch.Tensor = None,
    ) -> torch.Tensor:
        n_tokens = hidden_states.shape[0]

        # Project to (3, seq, heads, head_dim) and split into q/k/v.
        qkv = self.qkv(hidden_states).reshape(n_tokens, 3, self.num_heads, -1).permute(1, 0, 2, 3)
        q, k, v = qkv.unbind(0)
        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # Additive mask: -inf everywhere except inside each image's block.
        mask = torch.full(
            [1, n_tokens, n_tokens], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
        )
        for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:]):
            mask[..., start:end, start:end] = 0

        q, k, v = (t.transpose(0, 1) for t in (q, k, v))
        scores = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
        scores = scores + mask
        # Softmax in float32 for stability, then cast back.
        probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(q.dtype)
        out = torch.matmul(probs, v).transpose(0, 1).reshape(n_tokens, -1)
        return self.proj(out)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
class VisionFlashAttention2(nn.Module):
    """Multi-head self-attention backed by flash-attn's varlen kernel.

    Operates on a packed (total_tokens, dim) sequence; per-image boundaries
    come in through `cu_seqlens`.
    """

    def __init__(self, config, dim: int, num_heads: int = 16, bias=True) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=bias)
        self.proj = nn.Linear(dim, dim, bias=bias)
        self.config = config
        self.is_causal = config.is_causal

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: torch.Tensor = None,
    ) -> torch.Tensor:
        total_tokens = hidden_states.shape[0]
        # Packed (seq, heads, head_dim) layout expected by flash-attn.
        qkv = self.qkv(hidden_states).reshape(total_tokens, 3, self.num_heads, -1).permute(1, 0, 2, 3)
        q, k, v = qkv.unbind(0)
        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # Longest single-image sequence in the batch, needed by the kernel.
        longest = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
        out = flash_attn_varlen_func(
            q, k, v, cu_seqlens, cu_seqlens, longest, longest, causal=self.is_causal
        ).reshape(total_tokens, -1)
        return self.proj(out)
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
class VisionSdpaAttention(nn.Module):
    """Multi-head self-attention via torch scaled_dot_product_attention.

    A boolean block-diagonal mask keeps attention within each image's
    segment of the packed sequence.
    """

    def __init__(self, config, dim: int, num_heads: int = 16, bias=True) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=bias)
        self.proj = nn.Linear(dim, dim, bias=bias)
        self.config = config

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: torch.Tensor = None,
    ) -> torch.Tensor:
        n_tokens = hidden_states.shape[0]
        qkv = self.qkv(hidden_states).reshape(n_tokens, 3, self.num_heads, -1).permute(1, 0, 2, 3)
        q, k, v = qkv.unbind(0)

        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # Boolean mask: True where attention is allowed (within-image blocks).
        allowed = torch.zeros([1, n_tokens, n_tokens], device=q.device, dtype=torch.bool)
        for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:]):
            allowed[..., start:end, start:end] = True

        q, k, v = (t.transpose(0, 1) for t in (q, k, v))

        out = F.scaled_dot_product_attention(q, k, v, allowed, dropout_p=0.0)
        out = out.transpose(0, 1).reshape(n_tokens, -1)
        return self.proj(out)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
# Dispatch table mapping the config's `attn_implementation` string to the
# attention module used inside each DotsVisionBlock.
DOTS_VISION_ATTENTION_CLASSES = {
    "eager": VisionAttention,
    "flash_attention_2": VisionFlashAttention2,
    "sdpa": VisionSdpaAttention,
}
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization with a learned per-channel gain.

    Normalization is computed in float32 and cast back to the input dtype.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self._norm(x.float()).type_as(x) * self.weight

    def extra_repr(self) -> str:
        return f"{tuple(self.weight.shape)}, eps={self.eps}"

    def _norm(self, x: torch.Tensor) -> torch.Tensor:
        # Divide by RMS over the last dimension (eps avoids division by zero).
        scale = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return x * scale
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
class DotsSwiGLUFFN(nn.Module):
    """SwiGLU feed-forward block: fc2(silu(fc1(x)) * fc3(x))."""

    def __init__(self, config):
        super().__init__()
        inner_dim = config.intermediate_size
        model_dim = config.embed_dim
        use_bias = config.use_bias

        self.fc1 = nn.Linear(model_dim, inner_dim, bias=use_bias)  # gate projection
        self.fc2 = nn.Linear(inner_dim, model_dim, bias=use_bias)  # down projection
        self.fc3 = nn.Linear(model_dim, inner_dim, bias=use_bias)  # up projection

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gated = F.silu(self.fc1(x)) * self.fc3(x)
        return self.fc2(gated)
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
class DotsPatchEmbed(nn.Module):
    """Projects raw pixel patches into embedding space with a conv + RMSNorm."""

    def __init__(self, config):
        super().__init__()
        self.num_channels = config.num_channels
        self.patch_size = config.patch_size
        self.temporal_patch_size = config.temporal_patch_size
        self.embed_dim = config.embed_dim
        self.config = config
        self.proj = nn.Conv2d(
            config.num_channels,
            config.embed_dim,
            kernel_size=(config.patch_size, config.patch_size),
            stride=(config.patch_size, config.patch_size),
        )
        self.norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)

    def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor:
        # Unflatten to (N, C, T, H, W) and keep only the first temporal slice
        # before the 2D convolution.
        patches = x.view(
            -1, self.num_channels, self.temporal_patch_size, self.patch_size, self.patch_size
        )[:, :, 0]
        embedded = self.proj(patches).view(-1, self.embed_dim)
        return self.norm(embedded)
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
class DotsViTPreprocessor(nn.Module):
    """Thin wrapper that turns pixel input into patch tokens via DotsPatchEmbed."""

    def __init__(self, config):
        super().__init__()
        self.patch_h = config.patch_size
        self.patch_w = config.patch_size
        self.embed_dim = config.embed_dim
        self.config = config
        self.patchifier = DotsPatchEmbed(config)

    def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor:
        return self.patchifier(x, grid_thw)
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
class DotsVisionBlock(nn.Module):
    """Pre-norm transformer block: self-attention + SwiGLU FFN, both residual."""

    def __init__(self, config, attn_implementation: str = "flash_attention_2"):
        super().__init__()
        attn_cls = DOTS_VISION_ATTENTION_CLASSES[attn_implementation]
        self.attn = attn_cls(
            config, config.embed_dim, num_heads=config.num_attention_heads, bias=config.use_bias
        )
        self.norm1 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
        self.mlp = DotsSwiGLUFFN(config)
        self.norm2 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)

    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
        attn_out = self.attn(
            self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
        )
        hidden_states = hidden_states + attn_out
        return hidden_states + self.mlp(self.norm2(hidden_states))
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
class DotsVisionTransformer(PreTrainedModel):
    """Vision tower: patchify, rotary-embedded transformer blocks, patch merger.

    Consumes a packed batch of image patches plus per-image (t, h, w) grid
    sizes and returns merged patch embeddings for the language model.
    """

    def __init__(self, config: DotsVisionConfig) -> None:
        super().__init__(config)
        self.config = config
        self.spatial_merge_size = config.spatial_merge_size

        self.patch_embed = DotsViTPreprocessor(config)
        # NOTE(review): patchifier.proj is an nn.Conv2d, but _init_weights only
        # matches nn.Linear/nn.Conv3d, so this call is a no-op for it — confirm
        # whether Conv2d was intended (harmless when weights are loaded from a
        # checkpoint).
        self._init_weights(self.patch_embed.patchifier.proj)

        head_dim = config.embed_dim // config.num_attention_heads

        # Rotary table covers half the head dim; cos/sin are duplicated to the
        # full head dim inside apply_rotary_pos_emb_vision.
        self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)

        _num_hidden_layers = config.num_hidden_layers
        self.blocks = nn.ModuleList(
            [DotsVisionBlock(config, config.attn_implementation) for _ in range(_num_hidden_layers)]
        )

        if self.config.post_norm:
            self.post_trunk_norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)

        self.merger = PatchMerger(
            dim=config.hidden_size,
            context_dim=config.embed_dim,
            spatial_merge_size=config.spatial_merge_size,
            init_merger_std=self.config.init_merger_std,
        )

        self.gradient_checkpointing = False
        self._gradient_checkpointing_func = torch.utils.checkpoint.checkpoint

    def _init_weights(self, module):
        # Normal init for linear/conv3d weights and embeddings; zero biases.
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv3d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    @property
    def dtype(self) -> torch.dtype:
        # Use a representative parameter to report the model's dtype.
        return self.blocks[0].mlp.fc2.weight.dtype

    @property
    def device(self) -> torch.device:
        return self.blocks[0].mlp.fc2.weight.device

    def get_pos_ids_by_grid(self, grid_thw):
        """Build (h, w) position-id pairs for every patch of every image.

        Positions are permuted so that patches within each
        spatial_merge_size x spatial_merge_size window are contiguous,
        matching the merger's grouping.
        """
        pos_ids = []
        for t, h, w in grid_thw:
            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
            hpos_ids = hpos_ids.reshape(
                h // self.spatial_merge_size,
                self.spatial_merge_size,
                w // self.spatial_merge_size,
                self.spatial_merge_size,
            )
            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
            hpos_ids = hpos_ids.flatten()

            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
            wpos_ids = wpos_ids.reshape(
                h // self.spatial_merge_size,
                self.spatial_merge_size,
                w // self.spatial_merge_size,
                self.spatial_merge_size,
            )
            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
            wpos_ids = wpos_ids.flatten()
            # Repeat per temporal frame.
            pos_ids.append(
                torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)
            )

        return pos_ids

    def rot_pos_emb(self, grid_thw):
        """Look up rotary frequencies for every packed patch position."""
        pos_ids = self.get_pos_ids_by_grid(grid_thw)
        pos_ids = torch.cat(pos_ids, dim=0)
        # The table must cover the largest spatial extent in the batch.
        max_grid_size = grid_thw[:, 1:].max()
        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb

    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, bf16=True) -> torch.Tensor:
        """Encode packed image patches into merged patch embeddings.

        Args:
            hidden_states: flattened pixel patches for all images in the batch.
            grid_thw: per-image (t, h, w) patch-grid sizes.
            bf16: cast input to bfloat16 before patch embedding (default True).
        """
        if bf16:
            hidden_states = hidden_states.bfloat16()
        hidden_states = self.patch_embed(hidden_states, grid_thw)

        rotary_pos_emb = self.rot_pos_emb(grid_thw)

        # Cumulative token counts per image (t copies of h*w each); int32 for
        # attention kernels, except under tracing where the input dtype is kept.
        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
            dim=0,
            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
        )
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)

        for blk in self.blocks:
            if self.gradient_checkpointing and self.training:
                # Recompute activations in backward to save memory.
                hidden_states = self._gradient_checkpointing_func(
                    blk.__call__,
                    hidden_states,
                    cu_seqlens,
                    rotary_pos_emb,
                )
            else:
                hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)

        if self.config.post_norm:
            hidden_states = self.post_trunk_norm(hidden_states)

        hidden_states = self.merger(hidden_states)
        return hidden_states
|
preprocessor_config.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"auto_map": {
|
| 3 |
+
"AutoProcessor": "configuration_dots.DotsVLProcessor"
|
| 4 |
+
},
|
| 5 |
+
"min_pixels": 3136,
|
| 6 |
+
"max_pixels": 11289600,
|
| 7 |
+
"patch_size": 14,
|
| 8 |
+
"temporal_patch_size": 1,
|
| 9 |
+
"merge_size": 2,
|
| 10 |
+
"image_mean": [
|
| 11 |
+
0.48145466,
|
| 12 |
+
0.4578275,
|
| 13 |
+
0.40821073
|
| 14 |
+
],
|
| 15 |
+
"image_std": [
|
| 16 |
+
0.26862954,
|
| 17 |
+
0.26130258,
|
| 18 |
+
0.27577711
|
| 19 |
+
],
|
| 20 |
+
"image_processor_type": "Qwen2VLImageProcessor",
|
| 21 |
+
"processor_class": "DotsVLProcessor"
|
| 22 |
+
}
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|endoftext|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": "[PAD]"
|
| 25 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:386545eb05f08c51352cde2fcc2c867f1592bb330f305efd1c6a57a93b1244cd
|
| 3 |
+
size 7036028
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,391 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
},
|
| 181 |
+
"151665": {
|
| 182 |
+
"content": "<|imgpad|>",
|
| 183 |
+
"lstrip": false,
|
| 184 |
+
"normalized": false,
|
| 185 |
+
"rstrip": false,
|
| 186 |
+
"single_word": false,
|
| 187 |
+
"special": true
|
| 188 |
+
},
|
| 189 |
+
"151666": {
|
| 190 |
+
"content": "<|img|>",
|
| 191 |
+
"lstrip": false,
|
| 192 |
+
"normalized": false,
|
| 193 |
+
"rstrip": false,
|
| 194 |
+
"single_word": false,
|
| 195 |
+
"special": true
|
| 196 |
+
},
|
| 197 |
+
"151667": {
|
| 198 |
+
"content": "<|endofimg|>",
|
| 199 |
+
"lstrip": false,
|
| 200 |
+
"normalized": false,
|
| 201 |
+
"rstrip": false,
|
| 202 |
+
"single_word": false,
|
| 203 |
+
"special": true
|
| 204 |
+
},
|
| 205 |
+
"151668": {
|
| 206 |
+
"content": "<|systemprompt|>",
|
| 207 |
+
"lstrip": false,
|
| 208 |
+
"normalized": false,
|
| 209 |
+
"rstrip": false,
|
| 210 |
+
"single_word": false,
|
| 211 |
+
"special": true
|
| 212 |
+
},
|
| 213 |
+
"151669": {
|
| 214 |
+
"content": "<|endofsystemprompt|>",
|
| 215 |
+
"lstrip": false,
|
| 216 |
+
"normalized": false,
|
| 217 |
+
"rstrip": false,
|
| 218 |
+
"single_word": false,
|
| 219 |
+
"special": true
|
| 220 |
+
},
|
| 221 |
+
"151670": {
|
| 222 |
+
"content": "<|user|>",
|
| 223 |
+
"lstrip": false,
|
| 224 |
+
"normalized": false,
|
| 225 |
+
"rstrip": false,
|
| 226 |
+
"single_word": false,
|
| 227 |
+
"special": true
|
| 228 |
+
},
|
| 229 |
+
"151671": {
|
| 230 |
+
"content": "<|endofuser|>",
|
| 231 |
+
"lstrip": false,
|
| 232 |
+
"normalized": false,
|
| 233 |
+
"rstrip": false,
|
| 234 |
+
"single_word": false,
|
| 235 |
+
"special": true
|
| 236 |
+
},
|
| 237 |
+
"151672": {
|
| 238 |
+
"content": "<|assistant|>",
|
| 239 |
+
"lstrip": false,
|
| 240 |
+
"normalized": false,
|
| 241 |
+
"rstrip": false,
|
| 242 |
+
"single_word": false,
|
| 243 |
+
"special": true
|
| 244 |
+
},
|
| 245 |
+
"151673": {
|
| 246 |
+
"content": "<|endofassistant|>",
|
| 247 |
+
"lstrip": false,
|
| 248 |
+
"normalized": false,
|
| 249 |
+
"rstrip": false,
|
| 250 |
+
"single_word": false,
|
| 251 |
+
"special": true
|
| 252 |
+
},
|
| 253 |
+
"151674": {
|
| 254 |
+
"content": "<|ref_start|>",
|
| 255 |
+
"lstrip": false,
|
| 256 |
+
"normalized": false,
|
| 257 |
+
"rstrip": false,
|
| 258 |
+
"single_word": false,
|
| 259 |
+
"special": true
|
| 260 |
+
},
|
| 261 |
+
"151675": {
|
| 262 |
+
"content": "<|ref_end|>",
|
| 263 |
+
"lstrip": false,
|
| 264 |
+
"normalized": false,
|
| 265 |
+
"rstrip": false,
|
| 266 |
+
"single_word": false,
|
| 267 |
+
"special": true
|
| 268 |
+
},
|
| 269 |
+
"151676": {
|
| 270 |
+
"content": "[SEP]",
|
| 271 |
+
"lstrip": false,
|
| 272 |
+
"normalized": false,
|
| 273 |
+
"rstrip": false,
|
| 274 |
+
"single_word": false,
|
| 275 |
+
"special": true
|
| 276 |
+
},
|
| 277 |
+
"151677": {
|
| 278 |
+
"content": "<|pic|>",
|
| 279 |
+
"lstrip": false,
|
| 280 |
+
"normalized": false,
|
| 281 |
+
"rstrip": false,
|
| 282 |
+
"single_word": false,
|
| 283 |
+
"special": true
|
| 284 |
+
},
|
| 285 |
+
"151678": {
|
| 286 |
+
"content": "<|text|>",
|
| 287 |
+
"lstrip": false,
|
| 288 |
+
"normalized": false,
|
| 289 |
+
"rstrip": false,
|
| 290 |
+
"single_word": false,
|
| 291 |
+
"special": true
|
| 292 |
+
},
|
| 293 |
+
"151679": {
|
| 294 |
+
"content": "<|pictotext|>",
|
| 295 |
+
"lstrip": false,
|
| 296 |
+
"normalized": false,
|
| 297 |
+
"rstrip": false,
|
| 298 |
+
"single_word": false,
|
| 299 |
+
"special": true
|
| 300 |
+
},
|
| 301 |
+
"151680": {
|
| 302 |
+
"content": "[PAD]",
|
| 303 |
+
"lstrip": false,
|
| 304 |
+
"normalized": false,
|
| 305 |
+
"rstrip": false,
|
| 306 |
+
"single_word": false,
|
| 307 |
+
"special": true
|
| 308 |
+
},
|
| 309 |
+
"151681": {
|
| 310 |
+
"content": "<|slice|>",
|
| 311 |
+
"lstrip": false,
|
| 312 |
+
"normalized": false,
|
| 313 |
+
"rstrip": false,
|
| 314 |
+
"single_word": false,
|
| 315 |
+
"special": true
|
| 316 |
+
},
|
| 317 |
+
"151682": {
|
| 318 |
+
"content": "<|endofslice|>",
|
| 319 |
+
"lstrip": false,
|
| 320 |
+
"normalized": false,
|
| 321 |
+
"rstrip": false,
|
| 322 |
+
"single_word": false,
|
| 323 |
+
"special": true
|
| 324 |
+
},
|
| 325 |
+
"151683": {
|
| 326 |
+
"content": "<|imgrowend|>",
|
| 327 |
+
"lstrip": false,
|
| 328 |
+
"normalized": false,
|
| 329 |
+
"rstrip": false,
|
| 330 |
+
"single_word": false,
|
| 331 |
+
"special": true
|
| 332 |
+
},
|
| 333 |
+
"151684": {
|
| 334 |
+
"content": "<|polygon_start|>",
|
| 335 |
+
"lstrip": false,
|
| 336 |
+
"normalized": false,
|
| 337 |
+
"rstrip": false,
|
| 338 |
+
"single_word": false,
|
| 339 |
+
"special": true
|
| 340 |
+
},
|
| 341 |
+
"151685": {
|
| 342 |
+
"content": "<|polygon_end|>",
|
| 343 |
+
"lstrip": false,
|
| 344 |
+
"normalized": false,
|
| 345 |
+
"rstrip": false,
|
| 346 |
+
"single_word": false,
|
| 347 |
+
"special": true
|
| 348 |
+
},
|
| 349 |
+
"151686": {
|
| 350 |
+
"content": "<|image_gen_start|>",
|
| 351 |
+
"lstrip": false,
|
| 352 |
+
"normalized": false,
|
| 353 |
+
"rstrip": false,
|
| 354 |
+
"single_word": false,
|
| 355 |
+
"special": true
|
| 356 |
+
},
|
| 357 |
+
"151687": {
|
| 358 |
+
"content": "<|image_gen_end|>",
|
| 359 |
+
"lstrip": false,
|
| 360 |
+
"normalized": false,
|
| 361 |
+
"rstrip": false,
|
| 362 |
+
"single_word": false,
|
| 363 |
+
"special": true
|
| 364 |
+
}
|
| 365 |
+
},
|
| 366 |
+
"additional_special_tokens": [
|
| 367 |
+
"<|im_start|>",
|
| 368 |
+
"<|im_end|>",
|
| 369 |
+
"<|object_ref_start|>",
|
| 370 |
+
"<|object_ref_end|>",
|
| 371 |
+
"<|box_start|>",
|
| 372 |
+
"<|box_end|>",
|
| 373 |
+
"<|quad_start|>",
|
| 374 |
+
"<|quad_end|>",
|
| 375 |
+
"<|vision_start|>",
|
| 376 |
+
"<|vision_end|>",
|
| 377 |
+
"<|vision_pad|>",
|
| 378 |
+
"<|image_pad|>",
|
| 379 |
+
"<|video_pad|>"
|
| 380 |
+
],
|
| 381 |
+
"bos_token": null,
|
| 382 |
+
"chat_template": "{%- for m in messages %}\n {%- if m.role == 'system' %}\n {{- '<|system|>' + m.content + '<|endofsystem|>\\n' }}\n {%- elif m.role == 'user' %}\n {{- '<|user|>' + m.content + '<|endofuser|>' }}\n {%- elif m.role == 'assistant' %}\n {{- '<|assistant|>' + m.content }}\n {%- if not loop.last %}\n {{- '<|endofassistant|>' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if messages[-1].role != 'assistant' %}\n {{- '<|assistant|>' }}\n{%- endif %}",
|
| 383 |
+
"clean_up_tokenization_spaces": false,
|
| 384 |
+
"eos_token": "<|endoftext|>",
|
| 385 |
+
"errors": "replace",
|
| 386 |
+
"model_max_length": 131072,
|
| 387 |
+
"pad_token": "[PAD]",
|
| 388 |
+
"split_special_tokens": false,
|
| 389 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 390 |
+
"unk_token": null
|
| 391 |
+
}
|
vocab.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910
|
| 3 |
+
size 2776833
|