xinjie.wang committed on
Commit 5638c1f · 1 Parent(s): 22afe09
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. README.md +1 -1
  2. embodied_gen/models/sam3d.py +3 -2
  3. embodied_gen/utils/monkey_patches.py +4 -8
  4. thirdparty/sam3d/sam3d/.gitignore +1 -0
  5. thirdparty/sam3d/sam3d/CODE_OF_CONDUCT.md +80 -0
  6. thirdparty/sam3d/sam3d/CONTRIBUTING.md +39 -0
  7. thirdparty/sam3d/sam3d/LICENSE +52 -0
  8. thirdparty/sam3d/sam3d/README.md +152 -0
  9. thirdparty/sam3d/sam3d/checkpoints/.gitignore +2 -0
  10. thirdparty/sam3d/sam3d/demo.py +21 -0
  11. thirdparty/sam3d/sam3d/doc/setup.md +58 -0
  12. thirdparty/sam3d/sam3d/environments/default.yml +216 -0
  13. thirdparty/sam3d/sam3d/notebook/demo_3db_mesh_alignment.ipynb +149 -0
  14. thirdparty/sam3d/sam3d/notebook/demo_multi_object.ipynb +162 -0
  15. thirdparty/sam3d/sam3d/notebook/demo_single_object.ipynb +164 -0
  16. thirdparty/sam3d/sam3d/notebook/inference.py +414 -0
  17. thirdparty/sam3d/sam3d/notebook/mesh_alignment.py +469 -0
  18. thirdparty/sam3d/sam3d/patching/hydra +16 -0
  19. thirdparty/sam3d/sam3d/pyproject.toml +30 -0
  20. thirdparty/sam3d/sam3d/requirements.dev.txt +4 -0
  21. thirdparty/sam3d/sam3d/requirements.inference.txt +4 -0
  22. thirdparty/sam3d/sam3d/requirements.p3d.txt +2 -0
  23. thirdparty/sam3d/sam3d/requirements.txt +88 -0
  24. thirdparty/sam3d/sam3d/sam3d_objects/__init__.py +6 -0
  25. thirdparty/sam3d/sam3d/sam3d_objects/config/__init__.py +1 -0
  26. thirdparty/sam3d/sam3d/sam3d_objects/config/utils.py +174 -0
  27. thirdparty/sam3d/sam3d/sam3d_objects/data/__init__.py +1 -0
  28. thirdparty/sam3d/sam3d/sam3d_objects/data/dataset/__init__.py +1 -0
  29. thirdparty/sam3d/sam3d/sam3d_objects/data/dataset/tdfy/__init__.py +1 -0
  30. thirdparty/sam3d/sam3d/sam3d_objects/data/dataset/tdfy/img_and_mask_transforms.py +986 -0
  31. thirdparty/sam3d/sam3d/sam3d_objects/data/dataset/tdfy/img_processing.py +189 -0
  32. thirdparty/sam3d/sam3d/sam3d_objects/data/dataset/tdfy/pose_target.py +784 -0
  33. thirdparty/sam3d/sam3d/sam3d_objects/data/dataset/tdfy/preprocessor.py +203 -0
  34. thirdparty/sam3d/sam3d/sam3d_objects/data/dataset/tdfy/transforms_3d.py +50 -0
  35. thirdparty/sam3d/sam3d/sam3d_objects/data/utils.py +243 -0
  36. thirdparty/sam3d/sam3d/sam3d_objects/model/__init__.py +1 -0
  37. thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/__init__.py +1 -0
  38. thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/dit/__init__.py +1 -0
  39. thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/dit/embedder/__init__.py +1 -0
  40. thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/dit/embedder/dino.py +142 -0
  41. thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/dit/embedder/embedder_fuser.py +238 -0
  42. thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/dit/embedder/point_remapper.py +78 -0
  43. thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/dit/embedder/pointmap.py +238 -0
  44. thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/generator/__init__.py +1 -0
  45. thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/generator/base.py +65 -0
  46. thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/generator/classifier_free_guidance.py +259 -0
  47. thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/generator/flow_matching/__init__.py +1 -0
  48. thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/generator/flow_matching/model.py +363 -0
  49. thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/generator/flow_matching/solver.py +126 -0
  50. thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/generator/shortcut/__init__.py +1 -0
README.md CHANGED
@@ -10,7 +10,7 @@ pinned: false
  license: apache-2.0
  short_description: Generate physically plausible 3D model from single image.
  paper: https://huggingface.co/papers/2506.10600
- startup_duration_timeout: 2h
+ startup_duration_timeout: 4h
  ---
 
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
embodied_gen/models/sam3d.py CHANGED
@@ -94,9 +94,10 @@ class Sam3dInference:
      ) -> dict:
          if isinstance(image, Image.Image):
              image = np.array(image)
+         image = self.merge_mask_to_rgba(image, mask)
          return self.pipeline.run(
              image,
-             mask,
+             None,
              seed,
              stage1_only=False,
              with_mesh_postprocess=False,
@@ -132,7 +133,7 @@ if __name__ == "__main__":
 
      start = time()
 
-     output = pipeline(image, mask, seed=42)
+     output = pipeline.run(image, mask, seed=42)
      print(f"Running cost: {round(time()-start, 1)}")
 
      if torch.cuda.is_available():
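The change above stops passing the mask as a separate argument and instead embeds it in the image's alpha channel before calling `pipeline.run`. The new `merge_mask_to_rgba` helper follows the same logic as `Inference.merge_mask_to_rgba` in `thirdparty/sam3d/sam3d/notebook/inference.py` further down in this diff; a minimal sketch of that step, assuming an HxWx3 uint8 image and an HxW binary mask:

```python
import numpy as np

def merge_mask_to_rgba(image: np.ndarray, mask: np.ndarray) -> np.ndarray:
    """Embed a binary object mask into the alpha channel of an RGB image.

    Sketch of the helper used above (mirrors notebook/inference.py): the mask
    becomes the 4th (alpha) channel, so the pipeline can be called with
    mask=None and recover the object mask from the RGBA input.
    """
    mask = mask.astype(np.uint8) * 255  # boolean/0-1 mask -> 0/255 alpha
    mask = mask[..., None]              # HxW -> HxWx1
    return np.concatenate([image[..., :3], mask], axis=-1)  # HxWx4 RGBA
```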
embodied_gen/utils/monkey_patches.py CHANGED
@@ -397,17 +397,13 @@ def monkey_patch_sam3d():
                 exc_info=True,
             )
 
-        # glb.export("sample.glb")
-        logger.info("Finished!")
-
-        return {
+        result = {
             **ss_return_dict,
             **outputs,
-            "pointmap": pts.cpu().permute((1, 2, 0)),  # HxWx3
-            "pointmap_colors": pts_colors.cpu().permute(
-                (1, 2, 0)
-            ),  # HxWx3
+            "pointmap": pts.cpu().permute((1, 2, 0)),
+            "pointmap_colors": pts_colors.cpu().permute((1, 2, 0)),
         }
+        return result
 
     InferencePipelinePointMap.run = patch_run
 
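For context, `monkey_patch_sam3d` swaps the pipeline's `run` method for the patched function at import time. A minimal sketch of that monkey-patching pattern (the body below is illustrative only, not the actual `patch_run`; only `InferencePipelinePointMap` and its `run` attribute come from this diff):

```python
from sam3d_objects.pipeline.inference_pipeline_pointmap import (
    InferencePipelinePointMap,
)

# Keep a handle on the original method so the patch can delegate to it.
_original_run = InferencePipelinePointMap.run

def patch_run(self, *args, **kwargs):
    # Illustrative body: call the unpatched pipeline, then extend the returned
    # dict (the real patch above builds the dict itself and adds pointmap data).
    outputs = _original_run(self, *args, **kwargs)
    return {**outputs, "patched": True}  # hypothetical extra key

# Rebind the class attribute so every pipeline instance picks up the patch.
InferencePipelinePointMap.run = patch_run
```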
 
thirdparty/sam3d/sam3d/.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__
thirdparty/sam3d/sam3d/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to make participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ This Code of Conduct also applies outside the project spaces when there is a
56
+ reasonable belief that an individual's behavior may have a negative impact on
57
+ the project or its community.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported by contacting the project team at <[email protected]>. All
63
+ complaints will be reviewed and investigated and will result in a response that
64
+ is deemed necessary and appropriate to the circumstances. The project team is
65
+ obligated to maintain confidentiality with regard to the reporter of an incident.
66
+ Further details of specific enforcement policies may be posted separately.
67
+
68
+ Project maintainers who do not follow or enforce the Code of Conduct in good
69
+ faith may face temporary or permanent repercussions as determined by other
70
+ members of the project's leadership.
71
+
72
+ ## Attribution
73
+
74
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76
+
77
+ [homepage]: https://www.contributor-covenant.org
78
+
79
+ For answers to common questions about this code of conduct, see
80
+ https://www.contributor-covenant.org/faq
thirdparty/sam3d/sam3d/CONTRIBUTING.md ADDED
@@ -0,0 +1,39 @@
+ # Contributing to sam-3d-objects
+ We want to make contributing to this project as easy and transparent as
+ possible.
+
+ ## Our Development Process
+ ... (in particular how this is synced with internal changes to the project)
+
+ ## Pull Requests
+ We actively welcome your pull requests.
+
+ 1. Fork the repo and create your branch from `main`.
+ 2. If you've added code that should be tested, add tests.
+ 3. If you've changed APIs, update the documentation.
+ 4. Ensure the test suite passes.
+ 5. Make sure your code lints.
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
+
+ ## Contributor License Agreement ("CLA")
+ In order to accept your pull request, we need you to submit a CLA. You only need
+ to do this once to work on any of Meta's open source projects.
+
+ Complete your CLA here: <https://code.facebook.com/cla>
+
+ ## Issues
+ We use GitHub issues to track public bugs. Please ensure your description is
+ clear and has sufficient instructions to be able to reproduce the issue.
+
+ Meta has a [bounty program](https://bugbounty.meta.com/) for the safe
+ disclosure of security bugs. In those cases, please go through the process
+ outlined on that page and do not file a public issue.
+
+ ## Coding Style
+ * 2 spaces for indentation rather than tabs
+ * 80 character line length
+ * ...
+
+ ## License
+ By contributing to sam-3d-objects, you agree that your contributions will be licensed
+ under the LICENSE file in the root directory of this source tree.
thirdparty/sam3d/sam3d/LICENSE ADDED
@@ -0,0 +1,52 @@
1
+ SAM License
2
+ Last Updated: November 19, 2025
3
+
4
+ “Agreement” means the terms and conditions for use, reproduction, distribution and modification of the SAM Materials set forth herein.
5
+
6
+ “SAM Materials” means, collectively, Documentation and the models, software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, and other elements of the foregoing distributed by Meta and made available under this Agreement.
7
+
8
+ “Documentation” means the specifications, manuals and documentation accompanying
9
+ SAM Materials distributed by Meta.
10
+
11
+ “Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
12
+
13
+ “Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) or Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
14
+
15
+ “Sanctions” means any economic or trade sanctions or restrictions administered or enforced by the United States (including the Office of Foreign Assets Control of the U.S. Department of the Treasury (“OFAC”), the U.S. Department of State and the U.S. Department of Commerce), the United Nations, the European Union, or the United Kingdom.
16
+
17
+ “Trade Controls” means any of the following: Sanctions and applicable export and import controls.
18
+
19
+ By using or distributing any portion or element of the SAM Materials, you agree to be bound by this Agreement.
20
+
21
+ 1. License Rights and Redistribution.
22
+
23
+ a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the SAM Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the SAM Materials.
24
+
25
+ i. Grant of Patent License. Subject to the terms and conditions of this License, you are granted a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by Meta that are necessarily infringed alone or by combination of their contribution(s) with the SAM 3 Materials. If you institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the SAM 3 Materials incorporated within the work constitutes direct or contributory patent infringement, then any patent licenses granted to you under this License for that work shall terminate as of the date such litigation is filed.
26
+
27
+ b. Redistribution and Use.
28
+
29
+ i. Distribution of SAM Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the SAM Materials, or any derivative works thereof, available to a third party, you may only do so under the terms of this Agreement and you shall provide a copy of this Agreement with any such SAM Materials.
30
+
31
+ ii. If you submit for publication the results of research you perform on, using, or otherwise in connection with SAM Materials, you must acknowledge the use of SAM Materials in your publication.
32
+
33
+ iii. Your use of the SAM Materials must comply with applicable laws and regulations, including Trade Control Laws and applicable privacy and data protection laws.
34
+ iv. Your use of the SAM Materials will not involve or encourage others to reverse engineer, decompile or discover the underlying components of the SAM Materials.
35
+ v. You are not the target of Trade Controls and your use of SAM Materials must comply with Trade Controls. You agree not to use, or permit others to use, SAM Materials for any activities subject to the International Traffic in Arms Regulations (ITAR) or end uses prohibited by Trade Controls, including those related to military or warfare purposes, nuclear industries or applications, espionage, or the development or use of guns or illegal weapons.
36
+ 2. User Support. Your use of the SAM Materials is done at your own discretion; Meta does not process any information nor provide any service in relation to such use. Meta is under no obligation to provide any support services for the SAM Materials. Any support provided is “as is”, “with all faults”, and without warranty of any kind.
37
+
38
+ 3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE SAM MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE SAM MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE SAM MATERIALS AND ANY OUTPUT AND RESULTS.
39
+
40
+ 4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT OR INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
41
+
42
+ 5. Intellectual Property.
43
+
44
+ a. Subject to Meta’s ownership of SAM Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the SAM Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
45
+
46
+ b. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the SAM Materials, outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the SAM Materials.
47
+
48
+ 6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the SAM Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the SAM Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
49
+
50
+ 7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
51
+
52
+ 8. Modifications and Amendments. Meta may modify this Agreement from time to time; provided that they are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the SAM Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
thirdparty/sam3d/sam3d/README.md ADDED
@@ -0,0 +1,152 @@
1
+ # SAM 3D
2
+
3
+ SAM 3D Objects is one part of SAM 3D, a pair of models for object and human mesh reconstruction. If you’re looking for SAM 3D Body, [click here](https://github.com/facebookresearch/sam-3d-body).
4
+
5
+ # SAM 3D Objects
6
+
7
+ **SAM 3D Team**, [Xingyu Chen](https://scholar.google.com/citations?user=gjSHr6YAAAAJ&hl=en&oi=sra)\*, [Fu-Jen Chu](https://fujenchu.github.io/)\*, [Pierre Gleize](https://scholar.google.com/citations?user=4imOcw4AAAAJ&hl=en&oi=ao)\*, [Kevin J Liang](https://kevinjliang.github.io/)\*, [Alexander Sax](https://alexsax.github.io/)\*, [Hao Tang](https://scholar.google.com/citations?user=XY6Nh9YAAAAJ&hl=en&oi=sra)\*, [Weiyao Wang](https://sites.google.com/view/weiyaowang/home)\*, [Michelle Guo](https://scholar.google.com/citations?user=lyjjpNMAAAAJ&hl=en&oi=ao), [Thibaut Hardin](https://github.com/Thibaut-H), [Xiang Li](https://ryanxli.github.io/)⚬, [Aohan Lin](https://github.com/linaohan), [Jia-Wei Liu](https://jia-wei-liu.github.io/), [Ziqi Ma](https://ziqi-ma.github.io/)⚬, [Anushka Sagar](https://www.linkedin.com/in/anushkasagar/), [Bowen Song](https://scholar.google.com/citations?user=QQKVkfcAAAAJ&hl=en&oi=sra)⚬, [Xiaodong Wang](https://scholar.google.com/citations?authuser=2&user=rMpcFYgAAAAJ), [Jianing Yang](https://jedyang.com/)⚬, [Bowen Zhang](http://home.ustc.edu.cn/~zhangbowen/)⚬, [Piotr Dollár](https://pdollar.github.io/)†, [Georgia Gkioxari](https://georgiagkioxari.com/)†, [Matt Feiszli](https://scholar.google.com/citations?user=A-wA73gAAAAJ&hl=en&oi=ao)†§, [Jitendra Malik](https://people.eecs.berkeley.edu/~malik/)†§
8
+
9
+ ***Meta Superintelligence Labs***
10
+
11
+ *Core contributor (Alphabetical, Equal Contribution), ⚬Intern, †Project leads, §Equal Contribution
12
+
13
+ [[`Paper`](https://ai.meta.com/research/publications/sam-3d-3dfy-anything-in-images/)] [[`Code`](https://github.com/facebookresearch/sam-3d-objects)] [[`Website`](https://ai.meta.com/sam3d/)] [[`Demo`](https://www.aidemos.meta.com/segment-anything/editor/convert-image-to-3d)] [[`Blog`](https://ai.meta.com/blog/sam-3d/)] [[`BibTeX`](#citing-sam-3d-objects)] [[`Roboflow`](https://blog.roboflow.com/sam-3d/)]
14
+
15
+ **SAM 3D Objects** is a foundation model that reconstructs full 3D shape geometry, texture, and layout from a single image, excelling in real-world scenarios with occlusion and clutter by using progressive training and a data engine with human feedback. It outperforms prior 3D generation models in human preference tests on real-world objects and scenes. We released code, weights, online demo, and a new challenging benchmark.
16
+
17
+
18
+ <p align="center"><img src="doc/intro.png"/></p>
19
+
20
+ -----
21
+
22
+ <p align="center"><img src="doc/arch.png"/></p>
23
+
24
+ ## Latest updates
25
+
26
+ **11/19/2025** - Checkpoints Launched, Web Demo and Paper are out.
27
+
28
+ ## Installation
29
+
30
+ Follow the [setup](doc/setup.md) steps before running the following.
31
+
32
+ ## Single or Multi-Object 3D Generation
33
+
34
+ SAM 3D Objects can convert masked objects in an image, into 3D models with pose, shape, texture, and layout. SAM 3D is designed to be robust in challenging natural images, handling small objects and occlusions, unusual poses, and difficult situations encountered in uncurated natural scenes like this kidsroom:
35
+
36
+ <p align="center">
37
+ <img src="notebook/images/shutterstock_stylish_kidsroom_1640806567/image.png" width="55%"/>
38
+ <img src="doc/kidsroom_transparent.gif" width="40%"/>
39
+ </p>
40
+
41
+ For a quick start, run `python demo.py` or use the the following lines of code:
42
+
43
+ ```python
44
+ import sys
45
+
46
+ # import inference code
47
+ sys.path.append("notebook")
48
+ from inference import Inference, load_image, load_single_mask
49
+
50
+ # load model
51
+ tag = "hf"
52
+ config_path = f"checkpoints/{tag}/pipeline.yaml"
53
+ inference = Inference(config_path, compile=False)
54
+
55
+ # load image and mask
56
+ image = load_image("notebook/images/shutterstock_stylish_kidsroom_1640806567/image.png")
57
+ mask = load_single_mask("notebook/images/shutterstock_stylish_kidsroom_1640806567", index=14)
58
+
59
+ # run model
60
+ output = inference(image, mask, seed=42)
61
+
62
+ # export gaussian splat
63
+ output["gs"].save_ply(f"splat.ply")
64
+ ```
65
+
66
+ For more details and multi-object reconstruction, please take a look at out two jupyter notebooks:
67
+ * [single object](notebook/demo_single_object.ipynb)
68
+ * [multi object](notebook/demo_multi_object.ipynb)
69
+
70
+
71
+ ## SAM 3D Body
72
+
73
+ [SAM 3D Body (3DB)](https://github.com/facebookresearch/sam-3d-body) is a robust promptable foundation model for single-image 3D human mesh recovery (HMR).
74
+
75
+ As a way to combine the strengths of both **SAM 3D Objects** and **SAM 3D Body**, we provide an example notebook that demonstrates how to combine the results of both models such that they are aligned in the same frame of reference. Check it out [here](notebook/demo_3db_mesh_alignment.ipynb).
76
+
77
+ ## License
78
+
79
+ The SAM 3D Objects model checkpoints and code are licensed under [SAM License](./LICENSE).
80
+
81
+ ## Contributing
82
+
83
+ See [contributing](CONTRIBUTING.md) and the [code of conduct](CODE_OF_CONDUCT.md).
84
+
85
+ ## Contributors
86
+
87
+ The SAM 3D Objects project was made possible with the help of many contributors.
88
+
89
+ Robbie Adkins,
90
+ Paris Baptiste,
91
+ Karen Bergan,
92
+ Kai Brown,
93
+ Michelle Chan,
94
+ Ida Cheng,
95
+ Khadijat Durojaiye,
96
+ Patrick Edwards,
97
+ Daniella Factor,
98
+ Facundo Figueroa,
99
+ Rene de la Fuente,
100
+ Eva Galper,
101
+ Cem Gokmen,
102
+ Alex He,
103
+ Enmanuel Hernandez,
104
+ Dex Honsa,
105
+ Leonna Jones,
106
+ Arpit Kalla,
107
+ Kris Kitani,
108
+ Helen Klein,
109
+ Kei Koyama,
110
+ Robert Kuo,
111
+ Vivian Lee,
112
+ Alex Lende,
113
+ Jonny Li,
114
+ Kehan Lyu,
115
+ Faye Ma,
116
+ Mallika Malhotra,
117
+ Sasha Mitts,
118
+ William Ngan,
119
+ George Orlin,
120
+ Peter Park,
121
+ Don Pinkus,
122
+ Roman Radle,
123
+ Nikhila Ravi,
124
+ Azita Shokrpour,
125
+ Jasmine Shone,
126
+ Zayida Suber,
127
+ Phillip Thomas,
128
+ Tatum Turner,
129
+ Joseph Walker,
130
+ Meng Wang,
131
+ Claudette Ward,
132
+ Andrew Westbury,
133
+ Lea Wilken,
134
+ Nan Yang,
135
+ Yael Yungster
136
+
137
+
138
+ ## Citing SAM 3D Objects
139
+
140
+ If you use SAM 3D Objects in your research, please use the following BibTeX entry.
141
+
142
+ ```
143
+ @article{sam3dteam2025sam3d3dfyimages,
144
+ title={SAM 3D: 3Dfy Anything in Images},
145
+ author={SAM 3D Team and Xingyu Chen and Fu-Jen Chu and Pierre Gleize and Kevin J Liang and Alexander Sax and Hao Tang and Weiyao Wang and Michelle Guo and Thibaut Hardin and Xiang Li and Aohan Lin and Jiawei Liu and Ziqi Ma and Anushka Sagar and Bowen Song and Xiaodong Wang and Jianing Yang and Bowen Zhang and Piotr Dollár and Georgia Gkioxari and Matt Feiszli and Jitendra Malik},
146
+ year={2025},
147
+ eprint={2511.16624},
148
+ archivePrefix={arXiv},
149
+ primaryClass={cs.CV},
150
+ url={https://arxiv.org/abs/2511.16624},
151
+ }
152
+ ```
thirdparty/sam3d/sam3d/checkpoints/.gitignore ADDED
@@ -0,0 +1,2 @@
+ *
+ !.gitignore
thirdparty/sam3d/sam3d/demo.py ADDED
@@ -0,0 +1,21 @@
+ import sys
+
+ # import inference code
+ sys.path.append("notebook")
+ from inference import Inference, load_image, load_single_mask
+
+ # load model
+ tag = "hf"
+ config_path = f"checkpoints/{tag}/pipeline.yaml"
+ inference = Inference(config_path, compile=False)
+
+ # load image (RGBA only, mask is embedded in the alpha channel)
+ image = load_image("notebook/images/shutterstock_stylish_kidsroom_1640806567/image.png")
+ mask = load_single_mask("notebook/images/shutterstock_stylish_kidsroom_1640806567", index=14)
+
+ # run model
+ output = inference(image, mask, seed=42)
+
+ # export gaussian splat
+ output["gs"].save_ply(f"splat.ply")
+ print("Your reconstruction has been saved to splat.ply")
thirdparty/sam3d/sam3d/doc/setup.md ADDED
@@ -0,0 +1,58 @@
+ # Setup
+
+ ## Prerequisites
+
+ * A linux 64-bits architecture (i.e. `linux-64` platform in `mamba info`).
+ * A NVIDIA GPU with at least 32 Gb of VRAM.
+
+ ## 1. Setup Python Environment
+
+ The following will install the default environment. If you use `conda` instead of `mamba`, replace its name in the first two lines. Note that you may have to build the environment on a compute node with GPU (e.g., you may get a `RuntimeError: Not compiled with GPU support` error when running certain parts of the code that use Pytorch3D).
+
+ ```bash
+ # create sam3d-objects environment
+ mamba env create -f environments/default.yml
+ mamba activate sam3d-objects
+
+ # for pytorch/cuda dependencies
+ export PIP_EXTRA_INDEX_URL="https://pypi.ngc.nvidia.com https://download.pytorch.org/whl/cu121"
+
+ # install sam3d-objects and core dependencies
+ pip install -e '.[dev]'
+ pip install -e '.[p3d]' # pytorch3d dependency on pytorch is broken, this 2-step approach solves it
+
+ # for inference
+ export PIP_FIND_LINKS="https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html"
+ pip install -e '.[inference]'
+
+ # patch things that aren't yet in official pip packages
+ ./patching/hydra # https://github.com/facebookresearch/hydra/pull/2863
+ ```
+
+ ## 2. Getting Checkpoints
+
+ ### From HuggingFace
+
+ ⚠️ Before using SAM 3D Objects, please request access to the checkpoints on the SAM 3D Objects
+ Hugging Face [repo](https://huggingface.co/facebook/sam-3d-objects). Once accepted, you
+ need to be authenticated to download the checkpoints. You can do this by running
+ the following [steps](https://huggingface.co/docs/huggingface_hub/en/quick-start#authentication)
+ (e.g. `hf auth login` after generating an access token).
+
+ ⚠️ SAM 3D Objects is available via HuggingFace globally, **except** in comprehensively sanctioned jurisdictions.
+ Sanctioned jurisdiction will result in requests being **rejected**.
+
+ ```bash
+ pip install 'huggingface-hub[cli]<1.0'
+
+ TAG=hf
+ hf download \
+   --repo-type model \
+   --local-dir checkpoints/${TAG}-download \
+   --max-workers 1 \
+   facebook/sam-3d-objects
+ mv checkpoints/${TAG}-download/checkpoints checkpoints/${TAG}
+ rm -rf checkpoints/${TAG}-download
+ ```
thirdparty/sam3d/sam3d/environments/default.yml ADDED
@@ -0,0 +1,216 @@
1
+ name: sam3d-objects
2
+ channels:
3
+ - conda-forge
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=conda_forge
6
+ - _openmp_mutex=4.5=2_gnu
7
+ - alsa-lib=1.2.13=hb9d3cd8_0
8
+ - attr=2.5.1=h166bdaf_1
9
+ - binutils=2.43=h4852527_4
10
+ - binutils_impl_linux-64=2.43=h4bf12b8_4
11
+ - binutils_linux-64=2.43=h4852527_4
12
+ - bzip2=1.0.8=h4bc722e_7
13
+ - c-compiler=1.7.0=hd590300_1
14
+ - ca-certificates=2025.1.31=hbcca054_0
15
+ - cairo=1.18.0=h3faef2a_0
16
+ - cuda-cccl=12.1.109=ha770c72_0
17
+ - cuda-cccl-impl=2.0.1=ha770c72_1
18
+ - cuda-cccl_linux-64=12.1.109=ha770c72_0
19
+ - cuda-command-line-tools=12.1.1=ha770c72_0
20
+ - cuda-compiler=12.1.1=hbad6d8a_0
21
+ - cuda-cudart=12.1.105=hd3aeb46_0
22
+ - cuda-cudart-dev=12.1.105=hd3aeb46_0
23
+ - cuda-cudart-dev_linux-64=12.1.105=h59595ed_0
24
+ - cuda-cudart-static=12.1.105=hd3aeb46_0
25
+ - cuda-cudart-static_linux-64=12.1.105=h59595ed_0
26
+ - cuda-cudart_linux-64=12.1.105=h59595ed_0
27
+ - cuda-cuobjdump=12.1.111=h59595ed_0
28
+ - cuda-cupti=12.1.105=h59595ed_0
29
+ - cuda-cupti-dev=12.1.105=h59595ed_0
30
+ - cuda-cuxxfilt=12.1.105=h59595ed_0
31
+ - cuda-driver-dev=12.1.105=hd3aeb46_0
32
+ - cuda-driver-dev_linux-64=12.1.105=h59595ed_0
33
+ - cuda-gdb=12.1.105=hd47b8d6_0
34
+ - cuda-libraries=12.1.1=ha770c72_0
35
+ - cuda-libraries-dev=12.1.1=ha770c72_0
36
+ - cuda-nsight=12.1.105=ha770c72_0
37
+ - cuda-nvcc=12.1.105=hcdd1206_1
38
+ - cuda-nvcc-dev_linux-64=12.1.105=ha770c72_0
39
+ - cuda-nvcc-impl=12.1.105=hd3aeb46_0
40
+ - cuda-nvcc-tools=12.1.105=hd3aeb46_0
41
+ - cuda-nvcc_linux-64=12.1.105=h8a487aa_1
42
+ - cuda-nvdisasm=12.1.105=h59595ed_0
43
+ - cuda-nvml-dev=12.1.105=h59595ed_0
44
+ - cuda-nvprof=12.1.105=h59595ed_0
45
+ - cuda-nvprune=12.1.105=h59595ed_0
46
+ - cuda-nvrtc=12.1.105=hd3aeb46_0
47
+ - cuda-nvrtc-dev=12.1.105=hd3aeb46_0
48
+ - cuda-nvtx=12.1.105=h59595ed_0
49
+ - cuda-nvvp=12.1.105=h59595ed_0
50
+ - cuda-opencl=12.1.105=h59595ed_0
51
+ - cuda-opencl-dev=12.1.105=h59595ed_0
52
+ - cuda-profiler-api=12.1.105=ha770c72_0
53
+ - cuda-sanitizer-api=12.1.105=h59595ed_0
54
+ - cuda-toolkit=12.1.1=ha804496_0
55
+ - cuda-tools=12.1.1=ha770c72_0
56
+ - cuda-version=12.1=h1d6eff3_3
57
+ - cuda-visual-tools=12.1.1=ha770c72_0
58
+ - cxx-compiler=1.7.0=h00ab1b0_1
59
+ - dbus=1.13.6=h5008d03_3
60
+ - expat=2.6.4=h5888daf_0
61
+ - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
62
+ - font-ttf-inconsolata=3.000=h77eed37_0
63
+ - font-ttf-source-code-pro=2.038=h77eed37_0
64
+ - font-ttf-ubuntu=0.83=h77eed37_3
65
+ - fontconfig=2.15.0=h7e30c49_1
66
+ - fonts-conda-ecosystem=1=0
67
+ - fonts-conda-forge=1=0
68
+ - freetype=2.13.3=h48d6fc4_0
69
+ - gcc=12.4.0=h236703b_2
70
+ - gcc_impl_linux-64=12.4.0=h26ba24d_2
71
+ - gcc_linux-64=12.4.0=h6b7512a_8
72
+ - gds-tools=1.6.1.9=hd3aeb46_0
73
+ - gettext=0.23.1=h5888daf_0
74
+ - gettext-tools=0.23.1=h5888daf_0
75
+ - glib=2.82.2=h07242d1_1
76
+ - glib-tools=2.82.2=h4833e2c_1
77
+ - gmp=6.3.0=hac33072_2
78
+ - graphite2=1.3.13=h59595ed_1003
79
+ - gst-plugins-base=1.24.4=h9ad1361_0
80
+ - gstreamer=1.24.4=haf2f30d_0
81
+ - gxx=12.4.0=h236703b_2
82
+ - gxx_impl_linux-64=12.4.0=h3ff227c_2
83
+ - gxx_linux-64=12.4.0=h8489865_8
84
+ - harfbuzz=8.5.0=hfac3d4d_0
85
+ - icu=73.2=h59595ed_0
86
+ - kernel-headers_linux-64=3.10.0=he073ed8_18
87
+ - keyutils=1.6.1=h166bdaf_0
88
+ - krb5=1.21.3=h659f571_0
89
+ - lame=3.100=h166bdaf_1003
90
+ - ld_impl_linux-64=2.43=h712a8e2_4
91
+ - libasprintf=0.23.1=h8e693c7_0
92
+ - libasprintf-devel=0.23.1=h8e693c7_0
93
+ - libcap=2.75=h39aace5_0
94
+ - libclang-cpp15=15.0.7=default_h127d8a8_5
95
+ - libclang13=19.1.2=default_h9c6a7e4_1
96
+ - libcublas=12.1.3.1=hd3aeb46_0
97
+ - libcublas-dev=12.1.3.1=hd3aeb46_0
98
+ - libcufft=11.0.2.54=hd3aeb46_0
99
+ - libcufft-dev=11.0.2.54=hd3aeb46_0
100
+ - libcufile=1.6.1.9=hd3aeb46_0
101
+ - libcufile-dev=1.6.1.9=hd3aeb46_0
102
+ - libcups=2.3.3=h4637d8d_4
103
+ - libcurand=10.3.2.106=hd3aeb46_0
104
+ - libcurand-dev=10.3.2.106=hd3aeb46_0
105
+ - libcusolver=11.4.5.107=hd3aeb46_0
106
+ - libcusolver-dev=11.4.5.107=hd3aeb46_0
107
+ - libcusparse=12.1.0.106=hd3aeb46_0
108
+ - libcusparse-dev=12.1.0.106=hd3aeb46_0
109
+ - libedit=3.1.20250104=pl5321h7949ede_0
110
+ - libevent=2.1.12=hf998b51_1
111
+ - libexpat=2.6.4=h5888daf_0
112
+ - libffi=3.4.6=h2dba641_0
113
+ - libflac=1.4.3=h59595ed_0
114
+ - libgcc=14.2.0=h767d61c_2
115
+ - libgcc-devel_linux-64=12.4.0=h1762d19_102
116
+ - libgcc-ng=14.2.0=h69a702a_2
117
+ - libgcrypt-lib=1.11.0=hb9d3cd8_2
118
+ - libgettextpo=0.23.1=h5888daf_0
119
+ - libgettextpo-devel=0.23.1=h5888daf_0
120
+ - libglib=2.82.2=h2ff4ddf_1
121
+ - libgomp=14.2.0=h767d61c_2
122
+ - libgpg-error=1.51=hbd13f7d_1
123
+ - libiconv=1.18=h4ce23a2_1
124
+ - libjpeg-turbo=3.0.0=hd590300_1
125
+ - libllvm15=15.0.7=hb3ce162_4
126
+ - libllvm19=19.1.2=ha7bfdaf_0
127
+ - liblzma=5.6.4=hb9d3cd8_0
128
+ - liblzma-devel=5.6.4=hb9d3cd8_0
129
+ - libnpp=12.1.0.40=hd3aeb46_0
130
+ - libnpp-dev=12.1.0.40=hd3aeb46_0
131
+ - libnsl=2.0.1=hd590300_0
132
+ - libnuma=2.0.18=h4ab18f5_2
133
+ - libnvjitlink=12.1.105=hd3aeb46_0
134
+ - libnvjitlink-dev=12.1.105=hd3aeb46_0
135
+ - libnvjpeg=12.2.0.2=h59595ed_0
136
+ - libnvjpeg-dev=12.2.0.2=ha770c72_0
137
+ - libogg=1.3.5=h4ab18f5_0
138
+ - libopus=1.3.1=h7f98852_1
139
+ - libpng=1.6.47=h943b412_0
140
+ - libpq=16.8=h87c4ccc_0
141
+ - libsanitizer=12.4.0=ha732cd4_2
142
+ - libsndfile=1.2.2=hc60ed4a_1
143
+ - libsqlite=3.49.1=hee588c1_2
144
+ - libstdcxx=14.2.0=h8f9b012_2
145
+ - libstdcxx-devel_linux-64=12.4.0=h1762d19_102
146
+ - libstdcxx-ng=14.2.0=h4852527_2
147
+ - libsystemd0=257.4=h4e0b6ca_1
148
+ - libuuid=2.38.1=h0b41bf4_0
149
+ - libvorbis=1.3.7=h9c3ff4c_0
150
+ - libxcb=1.15=h0b41bf4_0
151
+ - libxkbcommon=1.7.0=h662e7e4_0
152
+ - libxkbfile=1.1.0=h166bdaf_1
153
+ - libxml2=2.12.7=h4c95cb1_3
154
+ - libzlib=1.3.1=hb9d3cd8_2
155
+ - lz4-c=1.10.0=h5888daf_1
156
+ - mpg123=1.32.9=hc50e24c_0
157
+ - mysql-common=8.3.0=h70512c7_5
158
+ - mysql-libs=8.3.0=ha479ceb_5
159
+ - ncurses=6.5=h2d0b736_3
160
+ - nsight-compute=2023.1.1.4=h3718151_0
161
+ - nspr=4.36=h5888daf_0
162
+ - nss=3.108=h159eef7_0
163
+ - ocl-icd=2.3.2=hb9d3cd8_2
164
+ - opencl-headers=2024.10.24=h5888daf_0
165
+ - openssl=3.4.1=h7b32b05_0
166
+ - packaging=24.2=pyhd8ed1ab_2
167
+ - pcre2=10.44=hba22ea6_2
168
+ - pip=25.0.1=pyh8b19718_0
169
+ - pixman=0.44.2=h29eaf8c_0
170
+ - pthread-stubs=0.4=hb9d3cd8_1002
171
+ - pulseaudio-client=17.0=hb77b528_0
172
+ - python=3.11.0=he550d4f_1_cpython
173
+ - qt-main=5.15.8=hc9dc06e_21
174
+ - readline=8.2=h8c095d6_2
175
+ - setuptools=75.8.2=pyhff2d567_0
176
+ - sysroot_linux-64=2.17=h0157908_18
177
+ - tk=8.6.13=noxft_h4845f30_101
178
+ - tzdata=2025b=h78e105d_0
179
+ - wayland=1.23.1=h3e06ad9_0
180
+ - wheel=0.45.1=pyhd8ed1ab_1
181
+ - xcb-util=0.4.0=hd590300_1
182
+ - xcb-util-image=0.4.0=h8ee46fc_1
183
+ - xcb-util-keysyms=0.4.0=h8ee46fc_1
184
+ - xcb-util-renderutil=0.3.9=hd590300_1
185
+ - xcb-util-wm=0.4.1=h8ee46fc_1
186
+ - xkeyboard-config=2.42=h4ab18f5_0
187
+ - xorg-compositeproto=0.4.2=hb9d3cd8_1002
188
+ - xorg-damageproto=1.2.1=hb9d3cd8_1003
189
+ - xorg-fixesproto=5.0=hb9d3cd8_1003
190
+ - xorg-inputproto=2.3.2=hb9d3cd8_1003
191
+ - xorg-kbproto=1.0.7=hb9d3cd8_1003
192
+ - xorg-libice=1.1.2=hb9d3cd8_0
193
+ - xorg-libsm=1.2.6=he73a12e_0
194
+ - xorg-libx11=1.8.9=h8ee46fc_0
195
+ - xorg-libxau=1.0.12=hb9d3cd8_0
196
+ - xorg-libxcomposite=0.4.6=h0b41bf4_1
197
+ - xorg-libxdamage=1.1.5=h7f98852_1
198
+ - xorg-libxdmcp=1.1.5=hb9d3cd8_0
199
+ - xorg-libxext=1.3.4=h0b41bf4_2
200
+ - xorg-libxfixes=5.0.3=h7f98852_1004
201
+ - xorg-libxi=1.7.10=h4bc722e_1
202
+ - xorg-libxrandr=1.5.2=h7f98852_1
203
+ - xorg-libxrender=0.9.11=hd590300_0
204
+ - xorg-libxtst=1.2.5=h4bc722e_0
205
+ - xorg-randrproto=1.5.0=hb9d3cd8_1002
206
+ - xorg-recordproto=1.14.2=hb9d3cd8_1003
207
+ - xorg-renderproto=0.11.1=hb9d3cd8_1003
208
+ - xorg-util-macros=1.20.2=hb9d3cd8_0
209
+ - xorg-xextproto=7.3.0=hb9d3cd8_1004
210
+ - xorg-xf86vidmodeproto=2.3.1=hb9d3cd8_1005
211
+ - xorg-xproto=7.0.31=hb9d3cd8_1008
212
+ - xz=5.6.4=hbcc6ac9_0
213
+ - xz-gpl-tools=5.6.4=hbcc6ac9_0
214
+ - xz-tools=5.6.4=hb9d3cd8_0
215
+ - zlib=1.3.1=hb9d3cd8_2
216
+ - zstd=1.5.7=hb8e6e7a_2
thirdparty/sam3d/sam3d/notebook/demo_3db_mesh_alignment.ipynb ADDED
@@ -0,0 +1,149 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# SAM 3D Body (3DB) Mesh Alignment to SAM 3D Object Scale\n",
8
+ "\n",
9
+ "This notebook processes a single 3DB mesh and aligns it to the SAM 3D Objects scale.\n",
10
+ "\n",
11
+ "**Input Data:**\n",
12
+ "- `images/human_object/image.jpg` - Input image for MoGe\n",
13
+ "- `meshes/human_object/3DB_results/mask_human.png` - Human mask\n",
14
+ "- `meshes/human_object/3DB_results/human.ply` - Single 3DB mesh in OpenGL coordinates\n",
15
+ "- `meshes/human_object/3DB_results/focal_length.json` - 3DB focal length\n",
16
+ "\n",
17
+ "**Output:**\n",
18
+ "- `meshes/human_object/aligned_meshes/human_aligned.ply` - Aligned 3DB mesh in OpenGL coordinates"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "import os\n",
28
+ "import torch\n",
29
+ "import matplotlib.pyplot as plt\n",
30
+ "from PIL import Image\n",
31
+ "from mesh_alignment import process_and_save_alignment\n",
32
+ "\n",
33
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
34
+ "print(f\"Using device: {device}\")\n",
35
+ "PATH = os.getcwd()\n",
36
+ "print(f\"Current working directory: {PATH}\")\n",
37
+ "\n",
38
+ "# Please inference the SAM 3D Body (3DB) Repo (https://github.com/facebookresearch/sam-3d-body) to get the 3DB Results\n",
39
+ "image_path = f\"{PATH}/images/human_object/image.png\"\n",
40
+ "mask_path = f\"{PATH}/meshes/human_object/3DB_results/mask_human.png\"\n",
41
+ "mesh_path = f\"{PATH}/meshes/human_object/3DB_results/human.ply\"\n",
42
+ "focal_length_json_path = f\"{PATH}/meshes/human_object/3DB_results/focal_length.json\"\n",
43
+ "output_dir = f\"{PATH}/meshes/human_object/aligned_meshes\"\n",
44
+ "os.makedirs(output_dir, exist_ok=True)\n"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "markdown",
49
+ "metadata": {},
50
+ "source": [
51
+ "## 1. Load and Display Input Data"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": null,
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "input_image = Image.open(image_path)\n",
61
+ "mask = Image.open(mask_path).convert('L')\n",
62
+ "fig, axes = plt.subplots(1, 2, figsize=(10, 5))\n",
63
+ "axes[0].imshow(input_image)\n",
64
+ "axes[0].set_title('Input Image')\n",
65
+ "axes[0].axis('off')\n",
66
+ "axes[1].imshow(mask, cmap='gray')\n",
67
+ "axes[1].set_title('Mask')\n",
68
+ "axes[1].axis('off')\n",
69
+ "plt.tight_layout()\n",
70
+ "plt.show()"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "markdown",
75
+ "metadata": {},
76
+ "source": [
77
+ "## 2. Process and Save Aligned Mesh"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": null,
83
+ "metadata": {},
84
+ "outputs": [],
85
+ "source": [
86
+ "\n",
87
+ "success, output_mesh_path, result = process_and_save_alignment(\n",
88
+ " mesh_path=mesh_path,\n",
89
+ " mask_path=mask_path,\n",
90
+ " image_path=image_path,\n",
91
+ " output_dir=output_dir,\n",
92
+ " device=device,\n",
93
+ " focal_length_json_path=focal_length_json_path\n",
94
+ ")\n",
95
+ "\n",
96
+ "if success:\n",
97
+ " print(f\"Alignment completed successfully! Output: {output_mesh_path}\")\n",
98
+ "else:\n",
99
+ " print(\"Alignment failed!\")"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "markdown",
104
+ "metadata": {},
105
+ "source": [
106
+ "## 3. Interactive 3D Visualization\n"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": null,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "from mesh_alignment import visualize_meshes_interactive\n",
116
+ "\n",
117
+ "aligned_mesh_path = f\"{PATH}/meshes/human_object/aligned_meshes/human_aligned.ply\"\n",
118
+ "dfy_mesh_path = f\"{PATH}/meshes/human_object/3Dfy_results/0.glb\"\n",
119
+ "\n",
120
+ "demo, combined_glb_path = visualize_meshes_interactive(\n",
121
+ " aligned_mesh_path=aligned_mesh_path,\n",
122
+ " dfy_mesh_path=dfy_mesh_path,\n",
123
+ " share=True\n",
124
+ ")"
125
+ ]
126
+ }
127
+ ],
128
+ "metadata": {
129
+ "kernelspec": {
130
+ "display_name": "sam3d_objects-3dfy",
131
+ "language": "python",
132
+ "name": "python3"
133
+ },
134
+ "language_info": {
135
+ "codemirror_mode": {
136
+ "name": "ipython",
137
+ "version": 3
138
+ },
139
+ "file_extension": ".py",
140
+ "mimetype": "text/x-python",
141
+ "name": "python",
142
+ "nbconvert_exporter": "python",
143
+ "pygments_lexer": "ipython3",
144
+ "version": "3.11.0"
145
+ }
146
+ },
147
+ "nbformat": 4,
148
+ "nbformat_minor": 4
149
+ }
thirdparty/sam3d/sam3d/notebook/demo_multi_object.ipynb ADDED
@@ -0,0 +1,162 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# Copyright (c) Meta Platforms, Inc. and affiliates."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {},
15
+ "source": [
16
+ "## 1. Imports and Model Loading"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "import os\n",
26
+ "import uuid\n",
27
+ "import imageio\n",
28
+ "import numpy as np\n",
29
+ "from IPython.display import Image as ImageDisplay\n",
30
+ "\n",
31
+ "from inference import Inference, ready_gaussian_for_video_rendering, load_image, load_masks, display_image, make_scene, render_video, interactive_visualizer"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": null,
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "PATH = os.getcwd()\n",
41
+ "TAG = \"hf\"\n",
42
+ "config_path = f\"{PATH}/../checkpoints/{TAG}/pipeline.yaml\"\n",
43
+ "inference = Inference(config_path, compile=False)"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "markdown",
48
+ "metadata": {},
49
+ "source": [
50
+ "## 2. Load input image to lift to 3D (multiple objects)"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "metadata": {},
57
+ "outputs": [],
58
+ "source": [
59
+ "IMAGE_PATH = f\"{PATH}/images/shutterstock_stylish_kidsroom_1640806567/image.png\"\n",
60
+ "IMAGE_NAME = os.path.basename(os.path.dirname(IMAGE_PATH))\n",
61
+ "\n",
62
+ "image = load_image(IMAGE_PATH)\n",
63
+ "masks = load_masks(os.path.dirname(IMAGE_PATH), extension=\".png\")\n",
64
+ "display_image(image, masks)"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "markdown",
69
+ "metadata": {},
70
+ "source": [
71
+ "## 3. Generate Gaussian Splats"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "outputs = [inference(image, mask, seed=42) for mask in masks]"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "markdown",
85
+ "metadata": {},
86
+ "source": [
87
+ "## 4. Visualize Gaussian Splat of the Scene\n",
88
+ "### a. Animated Gif"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "scene_gs = make_scene(*outputs)\n",
98
+ "scene_gs = ready_gaussian_for_video_rendering(scene_gs)\n",
99
+ "\n",
100
+ "# export gaussian splatting (as point cloud)\n",
101
+ "scene_gs.save_ply(f\"{PATH}/gaussians/multi/{IMAGE_NAME}.ply\")\n",
102
+ "\n",
103
+ "video = render_video(\n",
104
+ " scene_gs,\n",
105
+ " r=1,\n",
106
+ " fov=60,\n",
107
+ " resolution=512,\n",
108
+ ")[\"color\"]\n",
109
+ "\n",
110
+ "# save video as gif\n",
111
+ "imageio.mimsave(\n",
112
+ " os.path.join(f\"{PATH}/gaussians/multi/{IMAGE_NAME}.gif\"),\n",
113
+ " video,\n",
114
+ " format=\"GIF\",\n",
115
+ " duration=1000 / 30, # default assuming 30fps from the input MP4\n",
116
+ " loop=0, # 0 means loop indefinitely\n",
117
+ ")\n",
118
+ "\n",
119
+ "# notebook display\n",
120
+ "ImageDisplay(url=f\"gaussians/multi/{IMAGE_NAME}.gif?cache_invalidator={uuid.uuid4()}\",)"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "markdown",
125
+ "metadata": {},
126
+ "source": [
127
+ "### b. Interactive Visualizer"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "# might take a while to load (black screen)\n",
137
+ "interactive_visualizer(f\"{PATH}/gaussians/multi/{IMAGE_NAME}.ply\")"
138
+ ]
139
+ }
140
+ ],
141
+ "metadata": {
142
+ "kernelspec": {
143
+ "display_name": "sam3d-objects",
144
+ "language": "python",
145
+ "name": "python3"
146
+ },
147
+ "language_info": {
148
+ "codemirror_mode": {
149
+ "name": "ipython",
150
+ "version": 3
151
+ },
152
+ "file_extension": ".py",
153
+ "mimetype": "text/x-python",
154
+ "name": "python",
155
+ "nbconvert_exporter": "python",
156
+ "pygments_lexer": "ipython3",
157
+ "version": "3.11.0"
158
+ }
159
+ },
160
+ "nbformat": 4,
161
+ "nbformat_minor": 2
162
+ }
thirdparty/sam3d/sam3d/notebook/demo_single_object.ipynb ADDED
@@ -0,0 +1,164 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# Copyright (c) Meta Platforms, Inc. and affiliates."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {},
15
+ "source": [
16
+ "## 1. Imports and Model Loading"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "import os\n",
26
+ "import imageio\n",
27
+ "import uuid\n",
28
+ "from IPython.display import Image as ImageDisplay\n",
29
+ "from inference import Inference, ready_gaussian_for_video_rendering, render_video, load_image, load_single_mask, display_image, make_scene, interactive_visualizer"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "PATH = os.getcwd()\n",
39
+ "TAG = \"hf\"\n",
40
+ "config_path = f\"{PATH}/../checkpoints/{TAG}/pipeline.yaml\"\n",
41
+ "inference = Inference(config_path, compile=False)"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "markdown",
46
+ "metadata": {},
47
+ "source": [
48
+ "## 2. Load input image to lift to 3D (single object)"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "IMAGE_PATH = f\"{PATH}/images/shutterstock_stylish_kidsroom_1640806567/image.png\"\n",
58
+ "IMAGE_NAME = os.path.basename(os.path.dirname(IMAGE_PATH))\n",
59
+ "\n",
60
+ "image = load_image(IMAGE_PATH)\n",
61
+ "mask = load_single_mask(os.path.dirname(IMAGE_PATH), index=14)\n",
62
+ "display_image(image, masks=[mask])"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "markdown",
67
+ "metadata": {},
68
+ "source": [
69
+ "## 3. Generate Gaussian Splat"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": null,
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "# run model\n",
79
+ "output = inference(image, mask, seed=42)\n",
80
+ "\n",
81
+ "# export gaussian splat (as point cloud)\n",
82
+ "output[\"gs\"].save_ply(f\"{PATH}/gaussians/single/{IMAGE_NAME}.ply\")"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "markdown",
87
+ "metadata": {},
88
+ "source": [
89
+ "## 4. Visualize Gaussian Splat\n",
90
+ "### a. Animated Gif"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": null,
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "# render gaussian splat\n",
100
+ "scene_gs = make_scene(output)\n",
101
+ "scene_gs = ready_gaussian_for_video_rendering(scene_gs)\n",
102
+ "\n",
103
+ "video = render_video(\n",
104
+ " scene_gs,\n",
105
+ " r=1,\n",
106
+ " fov=60,\n",
107
+ " pitch_deg=15,\n",
108
+ " yaw_start_deg=-45,\n",
109
+ " resolution=512,\n",
110
+ ")[\"color\"]\n",
111
+ "\n",
112
+ "# save video as gif\n",
113
+ "imageio.mimsave(\n",
114
+ " os.path.join(f\"{PATH}/gaussians/single/{IMAGE_NAME}.gif\"),\n",
115
+ " video,\n",
116
+ " format=\"GIF\",\n",
117
+ " duration=1000 / 30, # default assuming 30fps from the input MP4\n",
118
+ " loop=0, # 0 means loop indefinitely\n",
119
+ ")\n",
120
+ "\n",
121
+ "# notebook display\n",
122
+ "ImageDisplay(url=f\"gaussians/single/{IMAGE_NAME}.gif?cache_invalidator={uuid.uuid4()}\")"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "markdown",
127
+ "metadata": {},
128
+ "source": [
129
+ "### b. Interactive Visualizer"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": null,
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "# might take a while to load (black screen)\n",
139
+ "interactive_visualizer(f\"{PATH}/gaussians/single/{IMAGE_NAME}.ply\")"
140
+ ]
141
+ }
142
+ ],
143
+ "metadata": {
144
+ "kernelspec": {
145
+ "display_name": "sam3d_objects-3dfy",
146
+ "language": "python",
147
+ "name": "python3"
148
+ },
149
+ "language_info": {
150
+ "codemirror_mode": {
151
+ "name": "ipython",
152
+ "version": 3
153
+ },
154
+ "file_extension": ".py",
155
+ "mimetype": "text/x-python",
156
+ "name": "python",
157
+ "nbconvert_exporter": "python",
158
+ "pygments_lexer": "ipython3",
159
+ "version": "3.11.0"
160
+ }
161
+ },
162
+ "nbformat": 4,
163
+ "nbformat_minor": 2
164
+ }
thirdparty/sam3d/sam3d/notebook/inference.py ADDED
@@ -0,0 +1,414 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import os
3
+
4
+ # not ideal to put that here
5
+ os.environ["CUDA_HOME"] = os.environ["CONDA_PREFIX"]
6
+ os.environ["LIDRA_SKIP_INIT"] = "true"
7
+
8
+ import sys
9
+ from typing import Union, Optional, List, Callable
10
+ import numpy as np
11
+ from PIL import Image
12
+ from omegaconf import OmegaConf, DictConfig, ListConfig
13
+ from hydra.utils import instantiate, get_method
14
+ import torch
15
+ import math
16
+ import utils3d
17
+ import shutil
18
+ import subprocess
19
+ import seaborn as sns
20
+ from PIL import Image
21
+ import numpy as np
22
+ import gradio as gr
23
+ import matplotlib.pyplot as plt
24
+ from copy import deepcopy
25
+ from kaolin.visualize import IpyTurntableVisualizer
26
+ from kaolin.render.camera import Camera, CameraExtrinsics, PinholeIntrinsics
27
+ import builtins
28
+ from pytorch3d.transforms import quaternion_multiply, quaternion_invert
29
+
30
+ import sam3d_objects # REMARK(Pierre) : do not remove this import
31
+ from sam3d_objects.pipeline.inference_pipeline_pointmap import InferencePipelinePointMap
32
+ from sam3d_objects.model.backbone.tdfy_dit.utils import render_utils
33
+
34
+ from sam3d_objects.utils.visualization import SceneVisualizer
35
+
36
+ __all__ = ["Inference"]
37
+
38
+ WHITELIST_FILTERS = [
39
+ lambda target: target.split(".", 1)[0] in {"sam3d_objects", "torch", "torchvision", "moge"},
40
+ ]
41
+
42
+ BLACKLIST_FILTERS = [
43
+ lambda target: get_method(target)
44
+ in {
45
+ builtins.exec,
46
+ builtins.eval,
47
+ builtins.__import__,
48
+ os.kill,
49
+ os.system,
50
+ os.putenv,
51
+ os.remove,
52
+ os.removedirs,
53
+ os.rmdir,
54
+ os.fchdir,
55
+ os.setuid,
56
+ os.fork,
57
+ os.forkpty,
58
+ os.killpg,
59
+ os.rename,
60
+ os.renames,
61
+ os.truncate,
62
+ os.replace,
63
+ os.unlink,
64
+ os.fchmod,
65
+ os.fchown,
66
+ os.chmod,
67
+ os.chown,
68
+ os.chroot,
69
+ os.fchdir,
70
+ os.lchown,
71
+ os.getcwd,
72
+ os.chdir,
73
+ shutil.rmtree,
74
+ shutil.move,
75
+ shutil.chown,
76
+ subprocess.Popen,
77
+ builtins.help,
78
+ },
79
+ ]
80
+
81
+
82
+ class Inference:
83
+ # public facing inference API
84
+ # only put publicly exposed arguments here
85
+ def __init__(self, config_file: str, compile: bool = False):
86
+ # load inference pipeline
87
+ config = OmegaConf.load(config_file)
88
+ config.rendering_engine = "pytorch3d" # overwrite to disable nvdiffrast
89
+ config.compile_model = compile
90
+ config.workspace_dir = os.path.dirname(config_file)
91
+ check_hydra_safety(config, WHITELIST_FILTERS, BLACKLIST_FILTERS)
92
+ self._pipeline: InferencePipelinePointMap = instantiate(config)
93
+
94
+ def merge_mask_to_rgba(self, image, mask):
95
+ mask = mask.astype(np.uint8) * 255
96
+ mask = mask[..., None]
97
+ # embed mask in alpha channel
98
+ rgba_image = np.concatenate([image[..., :3], mask], axis=-1)
99
+ return rgba_image
100
+
101
+ def __call__(
102
+ self,
103
+ image: Union[Image.Image, np.ndarray],
104
+ mask: Optional[Union[None, Image.Image, np.ndarray]],
105
+ seed: Optional[int] = None,
106
+ pointmap=None,
107
+ ) -> dict:
108
+ image = self.merge_mask_to_rgba(image, mask)
109
+ return self._pipeline.run(
110
+ image,
111
+ None,
112
+ seed,
113
+ stage1_only=False,
114
+ with_mesh_postprocess=False,
115
+ with_texture_baking=False,
116
+ with_layout_postprocess=True,
117
+ use_vertex_color=True,
118
+ stage1_inference_steps=None,
119
+ pointmap=pointmap,
120
+ )
121
+
122
+
123
+ def _yaw_pitch_r_fov_to_extrinsics_intrinsics(yaws, pitchs, rs, fovs):
124
+ is_list = isinstance(yaws, list)
125
+ if not is_list:
126
+ yaws = [yaws]
127
+ pitchs = [pitchs]
128
+ if not isinstance(rs, list):
129
+ rs = [rs] * len(yaws)
130
+ if not isinstance(fovs, list):
131
+ fovs = [fovs] * len(yaws)
132
+ extrinsics = []
133
+ intrinsics = []
134
+ for yaw, pitch, r, fov in zip(yaws, pitchs, rs, fovs):
135
+ fov = torch.deg2rad(torch.tensor(float(fov))).cuda()
136
+ yaw = torch.tensor(float(yaw)).cuda()
137
+ pitch = torch.tensor(float(pitch)).cuda()
138
+ orig = (
139
+ torch.tensor(
140
+ [
141
+ torch.sin(yaw) * torch.cos(pitch),
142
+ torch.sin(pitch),
143
+ torch.cos(yaw) * torch.cos(pitch),
144
+ ]
145
+ ).cuda()
146
+ * r
147
+ )
148
+ extr = utils3d.torch.extrinsics_look_at(
149
+ orig,
150
+ torch.tensor([0, 0, 0]).float().cuda(),
151
+ torch.tensor([0, 1, 0]).float().cuda(),
152
+ )
153
+ intr = utils3d.torch.intrinsics_from_fov_xy(fov, fov)
154
+ extrinsics.append(extr)
155
+ intrinsics.append(intr)
156
+ if not is_list:
157
+ extrinsics = extrinsics[0]
158
+ intrinsics = intrinsics[0]
159
+ return extrinsics, intrinsics
160
+
161
+
162
+ def render_video(
163
+ sample,
164
+ resolution=512,
165
+ bg_color=(0, 0, 0),
166
+ num_frames=300,
167
+ r=2.0,
168
+ fov=40,
169
+ pitch_deg=0,
170
+ yaw_start_deg=-90,
171
+ **kwargs,
172
+ ):
173
+
174
+ yaws = (
175
+ torch.linspace(0, 2 * torch.pi, num_frames) + math.radians(yaw_start_deg)
176
+ ).tolist()
177
+ pitch = [math.radians(pitch_deg)] * num_frames
178
+
179
+ extr, intr = _yaw_pitch_r_fov_to_extrinsics_intrinsics(yaws, pitch, r, fov)
180
+
181
+ return render_utils.render_frames(
182
+ sample,
183
+ extr,
184
+ intr,
185
+ {"resolution": resolution, "bg_color": bg_color, "backend": "gsplat"},
186
+ **kwargs,
187
+ )
188
+
189
+
190
+ def ready_gaussian_for_video_rendering(scene_gs, in_place=False, fix_alignment=False):
191
+ if fix_alignment:
192
+ scene_gs = _fix_gaussian_alignment(scene_gs, in_place=in_place)
193
+ scene_gs = normalized_gaussian(scene_gs, in_place=fix_alignment)
194
+ return scene_gs
195
+
196
+
197
+ def _fix_gaussian_alignment(scene_gs, in_place=False):
198
+ if not in_place:
199
+ scene_gs = deepcopy(scene_gs)
200
+
201
+ device = scene_gs._xyz.device
202
+ dtype = scene_gs._xyz.dtype
203
+ scene_gs._xyz = (
204
+ scene_gs._xyz
205
+ @ torch.tensor(
206
+ [
207
+ [-1, 0, 0],
208
+ [0, 0, 1],
209
+ [0, 1, 0],
210
+ ],
211
+ device=device,
212
+ dtype=dtype,
213
+ ).T
214
+ )
215
+ return scene_gs
216
+
217
+
218
+ def normalized_gaussian(scene_gs, in_place=False, outlier_percentile=None):
219
+ if not in_place:
220
+ scene_gs = deepcopy(scene_gs)
221
+
222
+ orig_xyz = scene_gs.get_xyz
223
+ orig_scale = scene_gs.get_scaling
224
+
225
+ active_mask = (scene_gs.get_opacity > 0.9).squeeze()
226
+ inv_scale = (
227
+ orig_xyz[active_mask].max(dim=0)[0] - orig_xyz[active_mask].min(dim=0)[0]
228
+ ).max()
229
+ norm_scale = orig_scale / inv_scale
230
+ norm_xyz = orig_xyz / inv_scale
231
+
232
+ if outlier_percentile is None:
233
+ lower_bound_xyz = torch.min(norm_xyz[active_mask], dim=0)[0]
234
+ upper_bound_xyz = torch.max(norm_xyz[active_mask], dim=0)[0]
235
+ else:
236
+ lower_bound_xyz = torch.quantile(
237
+ norm_xyz[active_mask],
238
+ outlier_percentile,
239
+ dim=0,
240
+ )
241
+ upper_bound_xyz = torch.quantile(
242
+ norm_xyz[active_mask],
243
+ 1.0 - outlier_percentile,
244
+ dim=0,
245
+ )
246
+
247
+ center = (lower_bound_xyz + upper_bound_xyz) / 2
248
+ norm_xyz = norm_xyz - center
249
+ scene_gs.from_xyz(norm_xyz)
250
+ scene_gs.mininum_kernel_size /= inv_scale.item()
251
+ scene_gs.from_scaling(norm_scale)
252
+ return scene_gs
253
+
254
+
255
+ def make_scene(*outputs, in_place=False):
256
+ if not in_place:
257
+ outputs = [deepcopy(output) for output in outputs]
258
+
259
+ all_outs = []
260
+ minimum_kernel_size = float("inf")
261
+ for output in outputs:
262
+ # move gaussians to scene frame of reference
263
+ PC = SceneVisualizer.object_pointcloud(
264
+ points_local=output["gaussian"][0].get_xyz.unsqueeze(0),
265
+ quat_l2c=output["rotation"],
266
+ trans_l2c=output["translation"],
267
+ scale_l2c=output["scale"],
268
+ )
269
+ output["gaussian"][0].from_xyz(PC.points_list()[0])
270
+ # must ... ROTATE
271
+ output["gaussian"][0].from_rotation(
272
+ quaternion_multiply(
273
+ quaternion_invert(output["rotation"]),
274
+ output["gaussian"][0].get_rotation,
275
+ )
276
+ )
277
+ scale = output["gaussian"][0].get_scaling
278
+ adjusted_scale = scale * output["scale"]
279
+ assert (
280
+ output["scale"][0, 0].item()
281
+ == output["scale"][0, 1].item()
282
+ == output["scale"][0, 2].item()
283
+ )
284
+ output["gaussian"][0].mininum_kernel_size *= output["scale"][0, 0].item()
285
+ adjusted_scale = torch.maximum(
286
+ adjusted_scale,
287
+ torch.tensor(
288
+ output["gaussian"][0].mininum_kernel_size * 1.1,
289
+ device=adjusted_scale.device,
290
+ ),
291
+ )
292
+ output["gaussian"][0].from_scaling(adjusted_scale)
293
+ minimum_kernel_size = min(
294
+ minimum_kernel_size,
295
+ output["gaussian"][0].mininum_kernel_size,
296
+ )
297
+ all_outs.append(output)
298
+
299
+ # merge gaussians
300
+ scene_gs = all_outs[0]["gaussian"][0]
301
+ scene_gs.mininum_kernel_size = minimum_kernel_size
302
+ for out in all_outs[1:]:
303
+ out_gs = out["gaussian"][0]
304
+ scene_gs._xyz = torch.cat([scene_gs._xyz, out_gs._xyz], dim=0)
305
+ scene_gs._features_dc = torch.cat(
306
+ [scene_gs._features_dc, out_gs._features_dc], dim=0
307
+ )
308
+ scene_gs._scaling = torch.cat([scene_gs._scaling, out_gs._scaling], dim=0)
309
+ scene_gs._rotation = torch.cat([scene_gs._rotation, out_gs._rotation], dim=0)
310
+ scene_gs._opacity = torch.cat([scene_gs._opacity, out_gs._opacity], dim=0)
311
+
312
+ return scene_gs
313
+
314
+
315
+ def check_target(
316
+ target: str,
317
+ whitelist_filters: List[Callable],
318
+ blacklist_filters: List[Callable],
319
+ ):
320
+ if any(filt(target) for filt in whitelist_filters):
321
+ if not any(filt(target) for filt in blacklist_filters):
322
+ return
323
+ raise RuntimeError(
324
+ f"target '{target}' is not allowed to be instantiated by hydra; if this is a mistake, please modify the whitelist_filters / blacklist_filters"
325
+ )
326
+
327
+
328
+ def check_hydra_safety(
329
+ config: DictConfig,
330
+ whitelist_filters: List[Callable],
331
+ blacklist_filters: List[Callable],
332
+ ):
333
+ to_check = [config]
334
+ while len(to_check) > 0:
335
+ node = to_check.pop()
336
+ if isinstance(node, DictConfig):
337
+ to_check.extend(list(node.values()))
338
+ if "_target_" in node:
339
+ check_target(node["_target_"], whitelist_filters, blacklist_filters)
340
+ elif isinstance(node, ListConfig):
341
+ to_check.extend(list(node))
342
+
343
+
344
+ def load_image(path):
345
+ image = Image.open(path)
346
+ image = np.array(image)
347
+ image = image.astype(np.uint8)
348
+ return image
349
+
350
+
351
+ def load_mask(path):
352
+ mask = load_image(path)
353
+ mask = mask > 0
354
+ if mask.ndim == 3:
355
+ mask = mask[..., -1]
356
+ return mask
357
+
358
+
359
+ def load_single_mask(folder_path, index=0, extension=".png"):
360
+ masks = load_masks(folder_path, [index], extension)
361
+ return masks[0]
362
+
363
+
364
+ def load_masks(folder_path, indices_list=None, extension=".png"):
365
+ masks = []
366
+ indices_list = [] if indices_list is None else list(indices_list)
367
+ if not len(indices_list) > 0: # get all masks if not provided
368
+ idx = 0
369
+ while os.path.exists(os.path.join(folder_path, f"{idx}{extension}")):
370
+ indices_list.append(idx)
371
+ idx += 1
372
+
373
+ for idx in indices_list:
374
+ mask_path = os.path.join(folder_path, f"{idx}{extension}")
375
+ assert os.path.exists(mask_path), f"Mask path {mask_path} does not exist"
376
+ mask = load_mask(mask_path)
377
+ masks.append(mask)
378
+ return masks
379
+
380
+
381
+ def display_image(image, masks=None):
382
+ def imshow(image, ax):
383
+ ax.axis("off")
384
+ ax.imshow(image)
385
+
386
+ grid = (1, 1) if masks is None else (2, 2)
387
+ fig, axes = plt.subplots(*grid)
388
+ if masks is not None:
389
+ mask_colors = sns.color_palette("husl", len(masks))
390
+ black_image = np.zeros_like(image[..., :3], dtype=float) # background
391
+ mask_display = np.copy(black_image)
392
+ mask_union = np.zeros_like(image[..., :3])
393
+ for i, mask in enumerate(masks):
394
+ mask_display[mask] = mask_colors[i]
395
+ mask_union |= mask[..., None] if mask.ndim == 2 else mask
396
+ imshow(black_image, axes[0, 1])
397
+ imshow(mask_display, axes[1, 0])
398
+ imshow(image * mask_union, axes[1, 1])
399
+
400
+ image_axe = axes if masks is None else axes[0, 0]
401
+ imshow(image, image_axe)
402
+
403
+ fig.tight_layout(pad=0)
404
+ fig.show()
405
+
406
+
407
+ def interactive_visualizer(ply_path):
408
+ with gr.Blocks() as demo:
409
+ gr.Markdown("# 3D Gaussian Splatting (black-screen loading might take a while)")
410
+ gr.Model3D(
411
+ value=ply_path, # splat file
412
+ label="3D Scene",
413
+ )
414
+ demo.launch(share=True)
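The helpers above form the full single-image flow used by the demo notebooks: build an Inference pipeline from a config, run it on an RGB image plus mask, merge the per-object outputs into one Gaussian scene, and render a turntable. A minimal sketch follows; the config and asset paths are placeholders, and it assumes the pipeline output dict carries the gaussian/rotation/translation/scale keys that make_scene() reads.

# Minimal usage sketch (hypothetical paths; assumed to run from the notebook/ directory).
from inference import (
    Inference,
    load_image,
    load_single_mask,
    make_scene,
    ready_gaussian_for_video_rendering,
    render_video,
)

pipe = Inference("checkpoints/pipeline.yaml")             # placeholder config path
image = load_image("examples/chair/image.png")            # placeholder RGB image
mask = load_single_mask("examples/chair/masks", index=0)  # placeholder mask folder

output = pipe(image, mask, seed=42)                       # assumed dict output of the pipeline
scene_gs = make_scene(output)                             # move object gaussians into the scene frame
scene_gs = ready_gaussian_for_video_rendering(scene_gs)   # recenter/normalize for turntable rendering
frames = render_video(scene_gs, num_frames=120)           # rendered frames via render_utils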
thirdparty/sam3d/sam3d/notebook/mesh_alignment.py ADDED
@@ -0,0 +1,469 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ """
3
+ SAM 3D Body (3DB) Mesh Alignment Utilities
4
+ Aligns SAM 3D Body (3DB) meshes to the same scale as the SAM 3D Objects output, i.e. the MoGe point cloud scale.
5
+ """
6
+
7
+ import os
8
+ import math
9
+ import json
10
+ import numpy as np
11
+ import torch
12
+ import trimesh
13
+ from PIL import Image
14
+ import torch.nn.functional as F
15
+ from pytorch3d.structures import Meshes
16
+ from pytorch3d.renderer import PerspectiveCameras, RasterizationSettings, MeshRasterizer, TexturesVertex
17
+ from moge.model.v1 import MoGeModel
18
+
19
+
20
+ def load_3db_mesh(mesh_path, device='cuda'):
21
+ """Load 3DB mesh and convert from OpenGL to PyTorch3D coordinates."""
22
+ mesh = trimesh.load(mesh_path)
23
+ vertices = np.array(mesh.vertices)
24
+ faces = np.array(mesh.faces)
25
+
26
+ # Convert from OpenGL to PyTorch3D coordinates
27
+ vertices[:, 0] *= -1 # Flip X
28
+ vertices[:, 2] *= -1 # Flip Z
29
+
30
+ vertices = torch.from_numpy(vertices).float().to(device)
31
+ faces = torch.from_numpy(faces).long().to(device)
32
+ return vertices, faces
33
+
34
+
35
+ def get_moge_pointcloud(image_tensor, device='cuda'):
36
+ """Generate MoGe point cloud from image tensor."""
37
+ moge_model = MoGeModel.from_pretrained("Ruicheng/moge-vitl").to(device)
38
+ moge_model.eval()
39
+ with torch.no_grad():
40
+ moge_output = moge_model.infer(image_tensor)
41
+ return moge_output
42
+
43
+
44
+ def denormalize_intrinsics(norm_K, height, width):
45
+ """Convert normalized intrinsics to absolute pixel coordinates."""
46
+ cx_norm, cy_norm = norm_K[0, 2], norm_K[1, 2]
47
+ fx_norm, fy_norm = norm_K[0, 0], norm_K[1, 1]
48
+
49
+ fx_abs = fx_norm * width
50
+ fy_abs = fy_norm * height
51
+ cx_abs = cx_norm * width
52
+ cy_abs = cy_norm * height
53
+ fx_abs = fy_abs # assume square pixels: use the vertical focal length for both axes
54
+
55
+ return np.array([
56
+ [fx_abs, 0.0, cx_abs],
57
+ [0.0, fy_abs, cy_abs],
58
+ [0.0, 0.0, 1.0]
59
+ ])
60
+
61
+
62
+ def crop_mesh_with_mask(vertices, faces, focal_length, mask, device='cuda'):
63
+ """Crop mesh vertices to only those visible in the mask."""
64
+ textures = TexturesVertex(verts_features=torch.ones_like(vertices)[None])
65
+ mesh = Meshes(verts=[vertices], faces=[faces], textures=textures)
66
+
67
+ H, W = mask.shape[-2:]
68
+ fx = fy = focal_length
69
+ cx, cy = W / 2.0, H / 2.0
70
+
71
+ camera = PerspectiveCameras(
72
+ focal_length=((fx, fy),),
73
+ principal_point=((cx, cy),),
74
+ image_size=((H, W),),
75
+ in_ndc=False, device=device
76
+ )
77
+
78
+ raster_settings = RasterizationSettings(
79
+ image_size=(H, W), blur_radius=0.0, faces_per_pixel=1,
80
+ cull_backfaces=False, bin_size=0,
81
+ )
82
+
83
+ rasterizer = MeshRasterizer(cameras=camera, raster_settings=raster_settings)
84
+ fragments = rasterizer(mesh)
85
+
86
+ face_indices = fragments.pix_to_face[0, ..., 0] # (H, W)
87
+ visible_mask = (mask > 0) & (face_indices >= 0)
88
+ visible_face_ids = face_indices[visible_mask]
89
+
90
+ visible_faces = faces[visible_face_ids]
91
+ visible_vert_ids = torch.unique(visible_faces)
92
+ verts_cropped = vertices[visible_vert_ids]
93
+
94
+ return verts_cropped, visible_mask
95
+
96
+
97
+ def extract_target_points(pointmap, visible_mask):
98
+ """Extract target points from MoGe pointmap using visible mask."""
99
+ target_points = pointmap[visible_mask.bool()]
100
+
101
+ # Convert from MoGe coordinates to PyTorch3D coordinates
102
+ target_points[:, 0] *= -1
103
+ target_points[:, 1] *= -1
104
+
105
+ # Remove flying points using adaptive quantile filtering
106
+ z_range = torch.max(target_points[:, 2]) - torch.min(target_points[:, 2])
107
+ if z_range > 6.0:
108
+ thresh = 0.90
109
+ elif z_range > 2.0:
110
+ thresh = 0.93
111
+ else:
112
+ thresh = 0.95
113
+
114
+ depth_quantile = torch.quantile(target_points[:, 2], thresh)
115
+ target_points = target_points[target_points[:, 2] <= depth_quantile]
116
+
117
+ # Remove infinite values
118
+ finite_mask = torch.isfinite(target_points).all(dim=1)
119
+ target_points = target_points[finite_mask]
120
+
121
+ return target_points
122
+
123
+
124
+ def align_mesh_to_pointcloud(vertices, target_points):
125
+ """Align mesh vertices to target point cloud using scale and translation."""
126
+ if target_points.shape[0] == 0:
127
+ print("[WARNING] No target points for alignment!")
128
+ return vertices, torch.tensor(1.0), torch.zeros(3)
129
+
130
+ # Scale alignment based on height
131
+ height_src = torch.max(vertices[:, 1]) - torch.min(vertices[:, 1])
132
+ height_tgt = torch.max(target_points[:, 1]) - torch.min(target_points[:, 1])
133
+ scale_factor = height_tgt / height_src
134
+
135
+ vertices_scaled = vertices * scale_factor
136
+
137
+ # Translation alignment based on centers
138
+ center_src = torch.mean(vertices_scaled, dim=0)
139
+ center_tgt = torch.mean(target_points, dim=0)
140
+ translation = center_tgt - center_src
141
+
142
+ vertices_aligned = vertices_scaled + translation
143
+ return vertices_aligned, scale_factor, translation
144
+
145
+
146
+ def load_mask_for_alignment(mask_path):
147
+ """Load mask image as numpy array."""
148
+ mask = Image.open(mask_path).convert('L')
149
+ mask_array = np.array(mask) / 255.0
150
+ return mask_array
151
+
152
+
153
+ def load_focal_length_from_json(json_path):
154
+ """Load focal length from JSON file."""
155
+ try:
156
+ with open(json_path, 'r') as f:
157
+ data = json.load(f)
158
+ focal_length = data.get('focal_length')
159
+ if focal_length is None:
160
+ raise ValueError("'focal_length' key not found in JSON file")
161
+ print(f"[INFO] Loaded focal length from {json_path}: {focal_length}")
162
+ return focal_length
163
+ except Exception as e:
164
+ print(f"[ERROR] Failed to load focal length from {json_path}: {e}")
165
+ raise
166
+
167
+
168
+ def process_3db_alignment(mesh_path, mask_path, image_path, device='cuda', focal_length_json_path=None):
169
+ """Complete pipeline for aligning 3DB mesh to MoGe scale."""
170
+ print(f"[INFO] Processing alignment...")
171
+
172
+ # Load input data
173
+ vertices, faces = load_3db_mesh(mesh_path, device)
174
+
175
+ # Load and preprocess image
176
+ image = Image.open(image_path).convert('RGB')
177
+ image_tensor = torch.from_numpy(np.array(image)).float().permute(2, 0, 1) / 255.0
178
+ image_tensor = image_tensor.to(device)
179
+
180
+ # Load mask and resize to match image
181
+ H, W = image_tensor.shape[1:]
182
+ mask = load_mask_for_alignment(mask_path)
183
+ if mask.shape != (H, W):
184
+ mask = Image.fromarray((mask * 255).astype(np.uint8))
185
+ mask = mask.resize((W, H), Image.NEAREST)
186
+ mask = np.array(mask) / 255.0
187
+ mask = torch.from_numpy(mask).float().to(device)
188
+
189
+ # Generate MoGe point cloud
190
+ print("[INFO] Generating MoGe point cloud...")
191
+ moge_output = get_moge_pointcloud(image_tensor, device)
192
+
193
+ # Load focal length from JSON if provided, otherwise compute from MoGe intrinsics
194
+ if focal_length_json_path is not None:
195
+ focal_length = load_focal_length_from_json(focal_length_json_path)
196
+ else:
197
+ # Compute camera parameters from MoGe intrinsics (fallback)
198
+ intrinsics = denormalize_intrinsics(moge_output['intrinsics'].cpu().numpy(), H, W)
199
+ focal_length = intrinsics[1, 1] # Use fy
200
+ print(f"[INFO] Using computed focal length from MoGe: {focal_length}")
201
+
202
+ # Crop mesh using mask
203
+ print("[INFO] Cropping mesh with mask...")
204
+ verts_cropped, visible_mask = crop_mesh_with_mask(vertices, faces, focal_length, mask, device)
205
+
206
+ # Extract target points from MoGe
207
+ print("[INFO] Extracting target points...")
208
+ target_points = extract_target_points(moge_output['points'], visible_mask)
209
+
210
+ if target_points.shape[0] == 0:
211
+ print("[ERROR] No valid target points found!")
212
+ return None
213
+
214
+ # Perform alignment
215
+ print("[INFO] Aligning mesh to point cloud...")
216
+ aligned_vertices, scale_factor, translation = align_mesh_to_pointcloud(verts_cropped, target_points)
217
+
218
+ # Apply alignment to full mesh
219
+ full_aligned_vertices = (vertices * scale_factor) + translation
220
+
221
+ # Convert back to OpenGL coordinates for final output
222
+ final_vertices_opengl = full_aligned_vertices.cpu().numpy()
223
+ final_vertices_opengl[:, 0] *= -1
224
+ final_vertices_opengl[:, 2] *= -1
225
+
226
+ results = {
227
+ 'aligned_vertices_opengl': final_vertices_opengl,
228
+ 'faces': faces.cpu().numpy(),
229
+ 'scale_factor': scale_factor.item(),
230
+ 'translation': translation.cpu().numpy(),
231
+ 'focal_length': focal_length,
232
+ 'target_points_count': target_points.shape[0],
233
+ 'cropped_vertices_count': verts_cropped.shape[0]
234
+ }
235
+
236
+ print(f"[INFO] Alignment completed - Scale: {scale_factor.item():.4f}, Target points: {target_points.shape[0]}")
237
+ return results
238
+
239
+
240
+ def process_and_save_alignment(mesh_path, mask_path, image_path, output_dir, device='cuda', focal_length_json_path=None):
241
+ """
242
+ Complete pipeline for processing 3DB alignment and saving the result.
243
+
244
+ Args:
245
+ mesh_path: Path to input 3DB mesh (.ply)
246
+ mask_path: Path to mask image (.png)
247
+ image_path: Path to input image (.jpg)
248
+ output_dir: Directory to save aligned mesh
249
+ device: Device to use ('cuda' or 'cpu')
250
+ focal_length_json_path: Optional path to focal length JSON file
251
+
252
+ Returns:
253
+ tuple: (success: bool, output_mesh_path: str or None, result_info: dict or None)
254
+ """
255
+ try:
256
+ print("[INFO] Starting 3DB mesh alignment pipeline...")
257
+
258
+ # Ensure output directory exists
259
+ os.makedirs(output_dir, exist_ok=True)
260
+
261
+ # Process alignment
262
+ result = process_3db_alignment(
263
+ mesh_path=mesh_path,
264
+ mask_path=mask_path,
265
+ image_path=image_path,
266
+ device=device,
267
+ focal_length_json_path=focal_length_json_path
268
+ )
269
+
270
+ if result is not None:
271
+ # Save aligned mesh
272
+ output_mesh_path = os.path.join(output_dir, 'human_aligned.ply')
273
+ aligned_mesh = trimesh.Trimesh(
274
+ vertices=result['aligned_vertices_opengl'],
275
+ faces=result['faces']
276
+ )
277
+ aligned_mesh.export(output_mesh_path)
278
+
279
+ print(f" SUCCESS! Saved aligned mesh to: {output_mesh_path}")
280
+ return True, output_mesh_path, result
281
+ else:
282
+ print(" ERROR: Failed to process mesh alignment")
283
+ return False, None, None
284
+
285
+ except Exception as e:
286
+ print(f" ERROR: Exception during processing: {e}")
287
+ import traceback
288
+ traceback.print_exc()
289
+ return False, None, None
290
+
291
+ finally:
292
+ print(" Processing complete!")
293
+
294
+
295
+ def visualize_meshes_interactive(aligned_mesh_path, dfy_mesh_path, output_dir=None, share=True, height=600):
296
+ """
297
+ Interactive Gradio-based 3D visualization of aligned human and object meshes.
298
+
299
+ Args:
300
+ aligned_mesh_path: Path to aligned mesh PLY file
301
+ dfy_mesh_path: Path to 3Dfy GLB file
302
+ output_dir: Directory to save combined GLB file (defaults to same dir as aligned_mesh_path)
303
+ share: Whether to create a public shareable link (default: True)
304
+ height: Height of the 3D viewer in pixels (default: 600)
305
+
306
+ Returns:
307
+ tuple: (demo, combined_glb_path) - Gradio demo object and path to combined GLB file
308
+ """
309
+ import gradio as gr
310
+
311
+ print("Loading meshes for interactive visualization...")
312
+
313
+ try:
314
+ # Load aligned mesh (PLY)
315
+ aligned_mesh = trimesh.load(aligned_mesh_path)
316
+ print(f"Loaded aligned mesh: {len(aligned_mesh.vertices)} vertices")
317
+
318
+ # Load 3Dfy mesh (GLB - handle scene structure)
319
+ dfy_scene = trimesh.load(dfy_mesh_path)
320
+
321
+ if hasattr(dfy_scene, 'dump'): # It's a scene
322
+ dfy_meshes = [geom for geom in dfy_scene.geometry.values() if hasattr(geom, 'vertices')]
323
+ if len(dfy_meshes) == 1:
324
+ dfy_mesh = dfy_meshes[0]
325
+ elif len(dfy_meshes) > 1:
326
+ dfy_mesh = trimesh.util.concatenate(dfy_meshes)
327
+ else:
328
+ raise ValueError("No valid meshes in GLB file")
329
+ else:
330
+ dfy_mesh = dfy_scene
331
+
332
+ print(f"Loaded 3Dfy mesh: {len(dfy_mesh.vertices)} vertices")
333
+
334
+ # Create combined scene
335
+ scene = trimesh.Scene()
336
+
337
+ # Add both meshes with different colors
338
+ aligned_copy = aligned_mesh.copy()
339
+ aligned_copy.visual.vertex_colors = [255, 0, 0, 200] # Red for aligned human
340
+ scene.add_geometry(aligned_copy, node_name="sam3d_aligned_human")
341
+
342
+ dfy_copy = dfy_mesh.copy()
343
+ dfy_copy.visual.vertex_colors = [0, 0, 255, 200] # Blue for 3Dfy object
344
+ scene.add_geometry(dfy_copy, node_name="dfy_object")
345
+
346
+ # Determine output path
347
+ if output_dir is None:
348
+ output_dir = os.path.dirname(aligned_mesh_path)
349
+ os.makedirs(output_dir, exist_ok=True)
350
+
351
+ combined_glb_path = os.path.join(output_dir, 'combined_scene.glb')
352
+ scene.export(combined_glb_path)
353
+ print(f"Exported combined scene to: {combined_glb_path}")
354
+
355
+ # Create interactive Gradio viewer
356
+ with gr.Blocks() as demo:
357
+ gr.Markdown("# 3D Mesh Alignment Visualization")
358
+ gr.Markdown("**Red**: SAM 3D Body Aligned Human | **Blue**: 3Dfy Object")
359
+ gr.Model3D(
360
+ value=combined_glb_path,
361
+ label="Combined 3D Scene (Interactive)",
362
+ height=height
363
+ )
364
+
365
+ # Launch the viewer
366
+ print("Launching interactive 3D viewer...")
367
+ demo.launch(share=share)
368
+
369
+ return demo, combined_glb_path
370
+
371
+ except Exception as e:
372
+ print(f"ERROR in visualization: {e}")
373
+ import traceback
374
+ traceback.print_exc()
375
+ return None, None
376
+
377
+
378
+ def visualize_meshes_comparison(aligned_mesh_path, dfy_mesh_path, use_interactive=False):
379
+ """
380
+ Simple visualization of both meshes in a single 3D plot.
381
+
382
+ DEPRECATED: Use visualize_meshes_interactive() for better interactive visualization.
383
+
384
+ Args:
385
+ aligned_mesh_path: Path to aligned mesh PLY file
386
+ dfy_mesh_path: Path to 3Dfy GLB file
387
+ use_interactive: Whether to attempt trimesh scene viewer (default: False)
388
+
389
+ Returns:
390
+ tuple: (aligned_mesh, dfy_mesh) trimesh objects or (None, None) if failed
391
+ """
392
+ import matplotlib.pyplot as plt
393
+
394
+ print("Loading meshes for visualization...")
395
+
396
+ try:
397
+ # Load aligned mesh (PLY)
398
+ aligned_mesh = trimesh.load(aligned_mesh_path)
399
+ print(f"Loaded aligned mesh: {len(aligned_mesh.vertices)} vertices")
400
+
401
+ # Load 3Dfy mesh (GLB - handle scene structure)
402
+ dfy_scene = trimesh.load(dfy_mesh_path)
403
+
404
+ if hasattr(dfy_scene, 'dump'): # It's a scene
405
+ dfy_meshes = [geom for geom in dfy_scene.geometry.values() if hasattr(geom, 'vertices')]
406
+ if len(dfy_meshes) == 1:
407
+ dfy_mesh = dfy_meshes[0]
408
+ elif len(dfy_meshes) > 1:
409
+ dfy_mesh = trimesh.util.concatenate(dfy_meshes)
410
+ else:
411
+ raise ValueError("No valid meshes in GLB file")
412
+ else:
413
+ dfy_mesh = dfy_scene
414
+
415
+ print(f"Loaded 3Dfy mesh: {len(dfy_mesh.vertices)} vertices")
416
+
417
+ # Create single 3D plot with both meshes
418
+ fig = plt.figure(figsize=(12, 10))
419
+ ax = fig.add_subplot(111, projection='3d')
420
+
421
+ # Plot both meshes in the same space
422
+ ax.scatter(dfy_mesh.vertices[:, 0],
423
+ dfy_mesh.vertices[:, 1],
424
+ dfy_mesh.vertices[:, 2],
425
+ c='blue', s=0.1, alpha=0.6, label='3Dfy Original')
426
+
427
+ ax.scatter(aligned_mesh.vertices[:, 0],
428
+ aligned_mesh.vertices[:, 1],
429
+ aligned_mesh.vertices[:, 2],
430
+ c='red', s=0.1, alpha=0.6, label='SAM 3D Body Aligned')
431
+
432
+ ax.set_title('Mesh Comparison: 3Dfy vs SAM 3D Body Aligned', fontsize=16, fontweight='bold')
433
+ ax.set_xlabel('X')
434
+ ax.set_ylabel('Y')
435
+ ax.set_zlabel('Z')
436
+ ax.legend()
437
+
438
+ plt.tight_layout()
439
+ plt.show()
440
+
441
+ # Optional trimesh scene viewer
442
+ if use_interactive:
443
+ try:
444
+ print("Creating trimesh scene...")
445
+ scene = trimesh.Scene()
446
+
447
+ # Add both meshes with different colors
448
+ aligned_copy = aligned_mesh.copy()
449
+ aligned_copy.visual.vertex_colors = [255, 0, 0, 200] # Red
450
+ scene.add_geometry(aligned_copy, node_name="sam3d_aligned")
451
+
452
+ dfy_copy = dfy_mesh.copy()
453
+ dfy_copy.visual.vertex_colors = [0, 0, 255, 200] # Blue
454
+ scene.add_geometry(dfy_copy, node_name="dfy_original")
455
+
456
+ print("Opening interactive trimesh viewer...")
457
+ scene.show()
458
+
459
+ except Exception as e:
460
+ print(f"Trimesh viewer not available: {e}")
461
+
462
+ print("Visualization complete")
463
+ return aligned_mesh, dfy_mesh
464
+
465
+ except Exception as e:
466
+ print(f"ERROR in visualization: {e}")
467
+ import traceback
468
+ traceback.print_exc()
469
+ return None, None
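For context, a compact end-to-end call of the alignment utilities above might look like the sketch below; every path is a placeholder, and it assumes a 3DB mesh, a person mask, and a 3Dfy GLB have already been produced by the other demos.

# Minimal usage sketch (hypothetical paths; assumed to run from the notebook/ directory).
from mesh_alignment import process_and_save_alignment, visualize_meshes_interactive

ok, aligned_path, info = process_and_save_alignment(
    mesh_path="outputs/human.ply",          # placeholder 3DB mesh
    mask_path="outputs/human_mask.png",     # placeholder person mask
    image_path="examples/scene.jpg",        # placeholder input image
    output_dir="outputs/aligned",
    focal_length_json_path=None,            # fall back to MoGe intrinsics
)
if ok:
    demo, combined_glb = visualize_meshes_interactive(
        aligned_mesh_path=aligned_path,
        dfy_mesh_path="outputs/object.glb",  # placeholder 3Dfy GLB
        share=False,
    )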
thirdparty/sam3d/sam3d/patching/hydra ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env python
2
+
3
+ import os
4
+ import hydra
5
+ import urllib.request
6
+
7
+ if hydra.__version__ != "1.3.2":
8
+ raise RuntimeError("different hydra version has been found, cannot patch")
9
+
10
+ hydra_root = os.path.dirname(hydra.__file__)
11
+ utils_path = os.path.join(hydra_root, "core", "utils.py")
12
+
13
+ urllib.request.urlretrieve(
14
+ "https://raw.githubusercontent.com/gleize/hydra/78f00766b5f37672aa7232ebbf01bdd74246bd60/hydra/core/utils.py",
15
+ utils_path,
16
+ )
thirdparty/sam3d/sam3d/pyproject.toml ADDED
@@ -0,0 +1,30 @@
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-requirements-txt"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [tool.hatch.envs.default.env-vars]
6
+ PIP_EXTRA_INDEX_URL = "https://pypi.ngc.nvidia.com https://download.pytorch.org/whl/cu121"
7
+
8
+ [tool.hatch.metadata]
9
+ # for git-referenced dependencies
10
+ allow-direct-references = true
11
+
12
+ [project]
13
+ name = "sam3d_objects"
14
+ version = "0.0.1"
15
+ # required for "hatch-requirements-txt" to work
16
+ dynamic = ["dependencies", "optional-dependencies"]
17
+
18
+ [tool.hatch.build]
19
+ ignore-vcs = true
20
+ include = ["**/*.py"]
21
+ exclude = ["conftest.py", "*_test.py"]
22
+ packages = ["sam3d_objects"]
23
+
24
+ [tool.hatch.metadata.hooks.requirements_txt]
25
+ files = ["requirements.txt"]
26
+
27
+ [tool.hatch.metadata.hooks.requirements_txt.optional-dependencies]
28
+ p3d = ["requirements.p3d.txt"]
29
+ inference = ["requirements.inference.txt"]
30
+ dev = ["requirements.dev.txt"]
thirdparty/sam3d/sam3d/requirements.dev.txt ADDED
@@ -0,0 +1,4 @@
1
+ pytest
2
+ findpydeps
3
+ pipdeptree
4
+ lovely_tensors
thirdparty/sam3d/sam3d/requirements.inference.txt ADDED
@@ -0,0 +1,4 @@
1
+ kaolin==0.17.0
2
+ gsplat @ git+https://github.com/nerfstudio-project/gsplat.git@2323de5905d5e90e035f792fe65bad0fedd413e7
3
+ seaborn==0.13.2
4
+ gradio==5.49.0
thirdparty/sam3d/sam3d/requirements.p3d.txt ADDED
@@ -0,0 +1,2 @@
1
+ pytorch3d @ git+https://github.com/facebookresearch/pytorch3d.git@75ebeeaea0908c5527e7b1e305fbc7681382db47
2
+ flash_attn==2.8.3
thirdparty/sam3d/sam3d/requirements.txt ADDED
@@ -0,0 +1,88 @@
1
+ astor==0.8.1
2
+ async-timeout==4.0.3
3
+ auto_gptq==0.7.1
4
+ autoflake==2.3.1
5
+ av==12.0.0
6
+ bitsandbytes==0.43.0
7
+ black==24.3.0
8
+ bpy==4.3.0
9
+ colorama==0.4.6
10
+ conda-pack==0.7.1
11
+ crcmod==1.7
12
+ cuda-python==12.1.0
13
+ dataclasses==0.6
14
+ decord==0.6.0
15
+ deprecation==2.1.0
16
+ easydict==1.13
17
+ einops-exts==0.0.4
18
+ exceptiongroup==1.2.0
19
+ fastavro==1.9.4
20
+ fasteners==0.19
21
+ flake8==7.0.0
22
+ Flask==3.0.3
23
+ fqdn==1.5.1
24
+ ftfy==6.2.0
25
+ fvcore==0.1.5.post20221221
26
+ gdown==5.2.0
27
+ h5py==3.12.1
28
+ hdfs==2.7.3
29
+ httplib2==0.22.0
30
+ hydra-core==1.3.2
31
+ hydra-submitit-launcher==1.2.0
32
+ igraph==0.11.8
33
+ imath==0.0.2
34
+ isoduration==20.11.0
35
+ jsonlines==4.0.0
36
+ jsonpickle==3.0.4
37
+ jsonpointer==2.4
38
+ jupyter==1.1.1
39
+ librosa==0.10.1
40
+ lightning==2.3.3
41
+ loguru==0.7.2
42
+ mosaicml-streaming==0.7.5
43
+ nvidia-cuda-nvcc-cu12==12.1.105
44
+ nvidia-pyindex==1.0.9
45
+ objsize==0.7.0
46
+ open3d==0.18.0
47
+ opencv-python==4.9.0.80
48
+ OpenEXR==3.3.3
49
+ optimum==1.18.1
50
+ optree==0.14.1
51
+ orjson==3.10.0
52
+ panda3d-gltf==1.2.1
53
+ pdoc3==0.10.0
54
+ peft==0.10.0
55
+ pip-system-certs==4.0
56
+ point-cloud-utils==0.29.5
57
+ polyscope==2.3.0
58
+ pycocotools==2.0.7
59
+ pydot==1.4.2
60
+ pymeshfix==0.17.0
61
+ pymongo==4.6.3
62
+ pyrender==0.1.45
63
+ PySocks==1.7.1
64
+ pytest==8.1.1
65
+ python-pycg==0.9.2
66
+ randomname==0.2.1
67
+ roma==1.5.1
68
+ rootutils==1.0.7
69
+ Rtree==1.3.0
70
+ sagemaker==2.242.0
71
+ scikit-image==0.23.1
72
+ sentence-transformers==2.6.1
73
+ simplejson==3.19.2
74
+ smplx==0.1.28
75
+ spconv-cu121==2.3.8
76
+ tensorboard==2.16.2
77
+ timm==0.9.16
78
+ tomli==2.0.1
79
+ torchaudio==2.5.1+cu121
80
+ uri-template==1.3.0
81
+ usort==1.0.8.post1
82
+ wandb==0.20.0
83
+ webcolors==1.13
84
+ webdataset==0.2.86
85
+ Werkzeug==3.0.6
86
+ xatlas==0.0.9
87
+ xformers==0.0.28.post3
88
+ MoGe @ git+https://github.com/microsoft/MoGe.git@a8c37341bc0325ca99b9d57981cc3bb2bd3e255b
thirdparty/sam3d/sam3d/sam3d_objects/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import os
3
+
4
+ # Allow skipping initialization for lightweight tools
5
+ if not os.environ.get('LIDRA_SKIP_INIT'):
6
+ import sam3d_objects.init
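The LIDRA_SKIP_INIT gate above is what notebook/inference.py relies on to keep imports lightweight; the sketch below shows the assumed pattern of setting it before the first import so sam3d_objects.init is never pulled in.

# Minimal sketch of the lightweight-import path.
import os

os.environ["LIDRA_SKIP_INIT"] = "true"  # must be set before the first import
import sam3d_objects                    # heavy initialization is skipped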
thirdparty/sam3d/sam3d/sam3d_objects/config/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
thirdparty/sam3d/sam3d/sam3d_objects/config/utils.py ADDED
@@ -0,0 +1,174 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import functools
3
+ from typing import Any, Callable, Union
4
+
5
+ from omegaconf import DictConfig, ListConfig, OmegaConf
6
+ from hydra.utils import instantiate
7
+
8
+ TargetType = Union[str, type, Callable[..., Any]]
9
+ ClassOrCallableType = Union[type, Callable[..., Any]]
10
+
11
+
12
+ def dump_config(config: DictConfig, path: str = "./config.yaml"):
13
+ txt = OmegaConf.to_yaml(config, sort_keys=True)
14
+ with open(path, "w") as f:
15
+ f.write(txt)
16
+
17
+
18
+ def locate(path: str) -> Any:
19
+ if path == "":
20
+ raise ImportError("Empty path")
21
+
22
+ import builtins
23
+ from importlib import import_module
24
+
25
+ parts = [part for part in path.split(".") if part]
26
+
27
+ # load module part
28
+ module = None
29
+ for n in reversed(range(len(parts))):
30
+ try:
31
+ mod = ".".join(parts[:n])
32
+ module = import_module(mod)
33
+ except Exception as e:
34
+ if n == 0:
35
+ raise ImportError(f"Error loading module '{path}'") from e
36
+ continue
37
+ if module:
38
+ break
39
+
40
+ if module:
41
+ obj = module
42
+ else:
43
+ obj = builtins
44
+
45
+ # load object path in module
46
+ for part in parts[n:]:
47
+ mod = mod + "." + part
48
+ if not hasattr(obj, part):
49
+ try:
50
+ import_module(mod)
51
+ except Exception as e:
52
+ raise ImportError(
53
+ f"Encountered error: `{e}` when loading module '{path}'"
54
+ ) from e
55
+ obj = getattr(obj, part)
56
+
57
+ return obj
58
+
59
+
60
+ def full_instance_name(instance: Any) -> str:
61
+ return full_class_name(instance.__class__)
62
+
63
+
64
+ def full_class_name(klass: Any) -> str:
65
+ module = klass.__module__
66
+ if module == "builtins":
67
+ return klass.__qualname__ # avoid outputs like 'builtins.str'
68
+ return module + "." + klass.__qualname__
69
+
70
+
71
+ def ensure_is_subclass(child_class: type, parent_class: type) -> None:
72
+ if not issubclass(child_class, parent_class):
73
+ raise RuntimeError(
74
+ f"class {full_class_name(child_class)} should be a subclass of {full_class_name(parent_class)}"
75
+ )
76
+
77
+
78
+ def find_class_or_callable_from_target(
79
+ target: TargetType,
80
+ ) -> ClassOrCallableType:
81
+ if isinstance(target, str):
82
+ obj = locate(target)
83
+ else:
84
+ obj = target
85
+
86
+ if (not isinstance(obj, type)) and (not callable(obj)):
87
+ raise ValueError(f"Invalid type ({type(obj)}) found for {target}")
88
+
89
+ return obj
90
+
91
+
92
+ def find_and_ensure_is_subclass(target: TargetType, type_: type) -> ClassOrCallableType:
93
+ klass = find_class_or_callable_from_target(target)
94
+ ensure_is_subclass(klass, type_)
95
+ return klass
96
+
97
+
98
+ class StrictPartial:
99
+ # remark : the `/` will handle the `path` argument name conflict (e.g. calling StrictPartial("a.b.c", ..., path="/a/b/c"))
100
+ def __init__(self, path, /, *args, **kwargs):
101
+ class_or_callable = find_class_or_callable_from_target(path)
102
+ self._partial = functools.partial(class_or_callable, *args, **kwargs)
103
+
104
+ def __call__(self, *args: Any, **kwargs: Any) -> Any:
105
+ return self._partial(*args, **kwargs)
106
+
107
+
108
+ class RecursivePartial:
109
+ @staticmethod
110
+ def replace_keys(config, key_mapping):
111
+ def recurse(data):
112
+ if isinstance(data, DictConfig):
113
+ new_data = {
114
+ key_mapping[k] if k in key_mapping else k: recurse(v)
115
+ for k, v in data.items()
116
+ }
117
+ new_data = DictConfig(new_data)
118
+ elif isinstance(data, ListConfig):
119
+ new_data = ListConfig([recurse(item) for item in data])
120
+ elif type(data) in {bool, str, int, float, type(None)}:
121
+ new_data = data
122
+ else:
123
+ raise RuntimeError(f"unknown type found: {type(data)}")
124
+
125
+ return new_data
126
+
127
+ return recurse(config)
128
+
129
+ def __init__(self, config):
130
+ self.config = RecursivePartial.replace_keys(
131
+ config, {"_rpartial_target_": "_target_"}
132
+ )
133
+
134
+ def __call__(self, *args: Any, **kwargs: Any) -> Any:
135
+ return instantiate(self.config)
136
+
137
+
138
+ class Partial(StrictPartial):
139
+ # remark : allow `path` argument to be exposed for easier use
140
+ def __init__(self, path, *args, **kwargs):
141
+ super().__init__(path, *args, **kwargs)
142
+
143
+
144
+ def subkey(mapping, key):
145
+ return mapping[key]
146
+
147
+
148
+ def make_set(*args):
149
+ return set(args)
150
+
151
+
152
+ def make_tuple(*args):
153
+ return tuple(args)
154
+
155
+
156
+ def make_list_from_kwargs(**kwargs):
157
+ # Filter out None/null values to avoid issues with callbacks
158
+ return [v for v in kwargs.values() if v is not None]
159
+
160
+
161
+ def make_string(value):
162
+ return str(value)
163
+
164
+
165
+ def make_dict(**kwargs):
166
+ return dict(kwargs)
167
+
168
+
169
+ def get_item(data, key: str):
170
+ return data[key]
171
+
172
+
173
+ def get_attr(data, key: str):
174
+ return getattr(data, key)
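The Partial/StrictPartial/locate helpers above let configs reference arbitrary dotted paths and pre-bind arguments. A small illustration follows, using standard-library targets purely as placeholders and skipping the heavy package init via LIDRA_SKIP_INIT.

# Minimal sketch of the dotted-path helpers (standard-library targets for illustration only).
import os

os.environ.setdefault("LIDRA_SKIP_INIT", "true")  # avoid heavy package init for this sketch
from sam3d_objects.config.utils import Partial, StrictPartial, locate

sqrt = locate("math.sqrt")               # resolves the dotted path to math.sqrt
pow2 = StrictPartial("builtins.pow", 2)  # binds the first positional argument to 2
assert sqrt(4.0) == 2.0
assert pow2(10) == 1024                  # pow(2, 10)

join_a = Partial("os.path.join", "a")    # Partial exposes `path` as a regular keyword-capable argument
print(join_a("b"))                       # "a/b" on POSIX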
thirdparty/sam3d/sam3d/sam3d_objects/data/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
thirdparty/sam3d/sam3d/sam3d_objects/data/dataset/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
thirdparty/sam3d/sam3d/sam3d_objects/data/dataset/tdfy/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
thirdparty/sam3d/sam3d/sam3d_objects/data/dataset/tdfy/img_and_mask_transforms.py ADDED
@@ -0,0 +1,986 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ from collections import namedtuple
3
+ import random
4
+ from typing import Optional, Dict
5
+
6
+ import numpy as np
7
+ import matplotlib.pyplot as plt
8
+ import torchvision.transforms.functional
9
+ from sam3d_objects.data.dataset.tdfy.img_processing import pad_to_square_centered
10
+ from sam3d_objects.model.backbone.dit.embedder.point_remapper import PointRemapper
11
+ from typing import Optional, Dict
12
+ from loguru import logger
13
+ import torch
14
+ import torch.nn.functional as F
15
+ import torchvision
16
+ import torchvision.transforms as tv_transforms
17
+ import torchvision.transforms.functional
18
+ import torchvision.transforms.functional as TF
19
+
20
+ from sam3d_objects.data.dataset.tdfy.img_processing import pad_to_square_centered
21
+
22
+
23
+ def UNNORMALIZE(mean, std):
24
+ mean = torch.tensor(mean).reshape((3, 1, 1))
25
+ std = torch.tensor(std).reshape((3, 1, 1))
26
+
27
+ def unnormalize_img(img):
28
+ assert img.ndim == 3 and img.shape[0] == 3
29
+
30
+ return img * std.to(img.device) + mean.to(img.device)
31
+
32
+ return unnormalize_img
33
+
34
+
35
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
36
+ IMAGENET_STD = (0.229, 0.224, 0.225)
37
+
38
+
39
+ IMAGENET_NORMALIZATION = tv_transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
40
+ IMAGENET_UNNORMALIZATION = UNNORMALIZE(IMAGENET_MEAN, IMAGENET_STD)
41
+
42
+
43
+ class BoundingBoxError(Exception):
44
+ pass
45
+
46
+
47
+ def check_bounding_box(bbox_w, bbox_h):
48
+ if bbox_w < 2 or bbox_h < 2:
49
+ raise BoundingBoxError("Bounding box dimensions must be at least 2x2.")
50
+
51
+
52
+ class RGBAImageProcessor:
53
+ def __init__(
54
+ self,
55
+ resize_and_make_square_kwargs: Optional[Dict] = None,
56
+ object_crop_kwargs: Optional[Dict] = None,
57
+ remove_background: bool = False,
58
+ imagenet_normalization: bool = False,
59
+ ):
60
+ self.remove_background = remove_background
61
+ self.resize_and_pad_kwargs = resize_and_make_square_kwargs
62
+ self.object_crop_kwargs = object_crop_kwargs
63
+ self.imagenet_normalization = imagenet_normalization
64
+ if resize_and_make_square_kwargs is not None:
65
+ self.transforms = resize_and_make_square(**resize_and_make_square_kwargs)
66
+
67
+ def __call__(
68
+ self, image: torch.Tensor, mask: Optional[torch.Tensor] = None
69
+ ) -> tuple[torch.Tensor, torch.Tensor]:
70
+ if mask is None:
71
+ assert (
72
+ image.shape[0] == 4
73
+ ), f"Requires 4 channels (RGB + alpha), got {image.shape[0]=}"
74
+ image, mask = split_rgba(image)
75
+ else:
76
+ assert (
77
+ image.shape[0] == 3
78
+ ), f"Requires 3 channels (RGB), got {image.shape[0]=}"
79
+ assert mask.dim() == 2, f"Requires 2D mask, got {mask.dim()=}"
80
+
81
+ if not self.object_crop_kwargs in [None, False]:
82
+ image, mask = crop_around_mask_with_padding(
83
+ image, mask, **self.object_crop_kwargs
84
+ )
85
+
86
+ if self.remove_background:
87
+ image, mask = rembg(image, mask)
88
+
89
+ image = self.transforms["img_transform"](image)
90
+ mask = self.transforms["mask_transform"](mask.unsqueeze(0))
91
+
92
+ if self.imagenet_normalization:
93
+ image = IMAGENET_NORMALIZATION(image)
94
+ return image, mask
95
+
96
+
97
+ def load_rgb(fpath: str) -> torch.Tensor:
98
+ """
99
+ Load a RGB(A) image from a file path.
100
+ """
101
+ image = plt.imread(fpath) # Why use matplotlib?
102
+ if image.dtype == "uint8":
103
+ image = image / 255
104
+ image = image.astype(np.float32)
105
+ image = torch.from_numpy(image)
106
+ image = image.permute(2, 0, 1).contiguous()
107
+ return image
108
+
109
+
110
+ def concat_rgba(
111
+ rgb_image: torch.Tensor,
112
+ mask: torch.Tensor,
113
+ ) -> torch.Tensor:
114
+ """
115
+ Create a 4-channel RGBA image from a 3-channel RGB image and a mask.
116
+ """
117
+ assert rgb_image.dim() == 3, f"{rgb_image.shape=}"
118
+ assert mask.dim() == 2, f"{mask.shape=}"
119
+ assert rgb_image.shape[0] == 3, f"{rgb_image.shape[0]=}"
120
+ assert rgb_image.shape[1:] == mask.shape, f"{rgb_image.shape[1:]=} != {mask.shape=}"
121
+ return torch.cat((rgb_image, mask[None, ...]), dim=0)
122
+
123
+
124
+ def split_rgba(rgba_image: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
125
+ """
126
+ Split a 4-channel RGBA image into a 3-channel RGB image and a 1-channel mask.
127
+
128
+ Args:
129
+ rgba_image: A 4-channel RGBA image.
130
+
131
+ Returns:
132
+ A tuple of (rgb_image, mask).
133
+ """
134
+ assert rgba_image.dim() == 3, f"{rgba_image.shape=}"
135
+ assert rgba_image.shape[0] == 4, f"{rgba_image.shape[0]=}"
136
+ return rgba_image[:3], rgba_image[3]
137
+
138
+
139
+ def get_mask(
140
+ rgb_image: torch.Tensor,
141
+ depth_image: torch.Tensor,
142
+ mask_source: str,
143
+ ) -> torch.Tensor:
144
+ """
145
+ Extract a mask from either the alpha channel of an RGB image or a depth image.
146
+
147
+ Args:
148
+ rgb_image: Tensor of shape (B, C, H, W) or (C, H, W) where C >= 4 if using alpha channel
149
+ depth_image: Tensor of shape (B, 1, H, W) or (1, H, W) containing depth information
150
+ mask_source: Source of the mask, either "ALPHA_CHANNEL" or "DEPTH"
151
+
152
+ Returns:
153
+ mask: Tensor of shape (B, 1, H, W) or (1, H, W) containing the extracted mask
154
+ """
155
+ # Handle unbatched inputs (add batch dimension if needed)
156
+ is_batched = len(rgb_image.shape) == 4
157
+
158
+ if not is_batched:
159
+ rgb_image = rgb_image.unsqueeze(0)
160
+ if depth_image is not None:
161
+ depth_image = depth_image.unsqueeze(0)
162
+
163
+ if mask_source == "ALPHA_CHANNEL":
164
+ if rgb_image.shape[1] != 4:
165
+ logger.warning(f"No ALPHA CHANNEL for the image, cannot read mask.")
166
+ mask = None
167
+ else:
168
+ mask = rgb_image[:, 3:4, :, :]
169
+ elif mask_source == "DEPTH":
170
+ mask = depth_image
171
+ else:
172
+ raise ValueError(f"Invalid mask source: {mask_source}")
173
+
174
+ # Remove batch dimension if input was unbatched
175
+ if not is_batched:
176
+ mask = mask.squeeze(0)
177
+
178
+ return mask
179
+
180
+
181
+ def rembg(image, mask, pointmap=None):
182
+ """
183
+ Remove the background from an image using a mask.
184
+ For pointmaps, sets background regions to NaN.
185
+
186
+ This function follows the standard transform pattern:
187
+ - If called with (image, mask), returns (image, mask)
188
+ - If called with (image, mask, pointmap), returns (image, mask, pointmap)
189
+ """
190
+ masked_image = image * mask
191
+
192
+ if pointmap is not None:
193
+ masked_pointmap = torch.where(mask > 0, pointmap, torch.nan)
194
+ return masked_image, mask, masked_pointmap
195
+
196
+ return masked_image, mask
197
+
198
+
199
+ def resize_and_make_square(
200
+ img_size: int,
201
+ make_square: bool | str = False,
202
+ ):
203
+ """
204
+ Create image and mask transforms based on configuration.
205
+
206
+ Returns:
207
+ dict: {"img_transform": img_transform, "mask_transform": mask_transform}
208
+ """
209
+ if isinstance(make_square, str):
210
+ make_square = make_square.lower()
211
+ assert make_square in ["pad", "crop", False]
212
+ pre_resize_transform = tv_transforms.Lambda(lambda x: x)
213
+ post_resize_transform = tv_transforms.Lambda(lambda x: x)
214
+ if make_square == "pad":
215
+ pre_resize_transform = pad_to_square_centered
216
+ elif make_square == "crop":
217
+ post_resize_transform = tv_transforms.CenterCrop(img_size)
218
+
219
+ img_resize = tv_transforms.Resize(img_size)
220
+ mask_resize = tv_transforms.Resize(
221
+ img_size,
222
+ interpolation=tv_transforms.InterpolationMode.BILINEAR,
223
+ )
224
+
225
+ img_transform = tv_transforms.Compose(
226
+ [
227
+ pre_resize_transform,
228
+ img_resize,
229
+ post_resize_transform,
230
+ ]
231
+ )
232
+
233
+ mask_transform = tv_transforms.Compose(
234
+ [
235
+ pre_resize_transform,
236
+ mask_resize,
237
+ post_resize_transform,
238
+ ]
239
+ )
240
+
241
+ return {
242
+ "img_transform": img_transform,
243
+ "mask_transform": mask_transform,
244
+ }
245
+
246
+
247
+ def crop_around_mask_with_random_box_size_factor(
248
+ loaded_image: torch.Tensor,
249
+ mask: torch.Tensor,
250
+ random_box_size_factor: float = 1.0,
251
+ pointmap: Optional[torch.Tensor] = None,
252
+ ) -> np.ndarray:
253
+ return crop_around_mask_with_padding(
254
+ loaded_image,
255
+ mask,
256
+ box_size_factor=1.0 + random.uniform(0, 1) * random_box_size_factor,
257
+ padding_factor=0.0,
258
+ pointmap=pointmap,
259
+ )
260
+
261
+
262
+ def crop_around_mask_with_padding(
263
+ loaded_image: torch.Tensor,
264
+ mask: torch.Tensor,
265
+ box_size_factor: float = 1.6,
266
+ padding_factor: float = 0.1,
267
+ pointmap: Optional[torch.Tensor] = None,
268
+ ) -> np.ndarray:
269
+ # cast to ensure the function can be called normally
270
+ cast_mask = False
271
+ if mask.dim() == 3:
272
+ assert mask.shape[0] == 1, "mask channel dimension must be 1"
273
+ mask = mask[0]
274
+ cast_mask = True
275
+ loaded_image = concat_rgba(loaded_image, mask)
276
+
277
+ bbox = compute_mask_bbox(mask, box_size_factor)
278
+ loaded_image = torchvision.transforms.functional.crop(
279
+ loaded_image, bbox[1], bbox[0], bbox[3] - bbox[1], bbox[2] - bbox[0]
280
+ )
281
+
282
+ # Crop pointmap if provided
283
+ if pointmap is not None:
284
+ pointmap = torchvision.transforms.functional.crop(
285
+ pointmap, bbox[1], bbox[0], bbox[3] - bbox[1], bbox[2] - bbox[0]
286
+ )
287
+
288
+ C, H, W = loaded_image.shape
289
+ max_dim = max(H, W) # Get the larger dimension
290
+
291
+ # Step 1: Pad to square shape
292
+ pad_h = (max_dim - H) // 2
293
+ pad_w = (max_dim - W) // 2
294
+ pad_h_extra = (max_dim - H) - pad_h # To ensure even padding
295
+ pad_w_extra = (max_dim - W) - pad_w
296
+
297
+ loaded_image = torch.nn.functional.pad(
298
+ loaded_image, (pad_w, pad_w_extra, pad_h, pad_h_extra), mode="constant", value=0
299
+ )
300
+ if pointmap is not None:
301
+ pointmap = torch.nn.functional.pad(
302
+ pointmap,
303
+ (pad_w, pad_w_extra, pad_h, pad_h_extra),
304
+ mode="constant",
305
+ value=float("nan"),
306
+ )
307
+
308
+ # Step 2: Extend by padding_factor (default 10%) on each side; empirically this gives better results overall
309
+ if padding_factor > 0:
310
+ extend_size = int(max_dim * padding_factor) # 10% extension on each side
311
+ loaded_image = torch.nn.functional.pad(
312
+ loaded_image,
313
+ (extend_size, extend_size, extend_size, extend_size),
314
+ mode="constant",
315
+ value=0,
316
+ )
317
+
318
+ if pointmap is not None:
319
+ pointmap = torch.nn.functional.pad(
320
+ pointmap,
321
+ (extend_size, extend_size, extend_size, extend_size),
322
+ mode="constant",
323
+ value=float("nan"),
324
+ )
325
+
326
+ rgb_image, mask = split_rgba(loaded_image)
327
+ if cast_mask:
328
+ mask = mask[None]
329
+
330
+ if pointmap is not None:
331
+ return rgb_image, mask, pointmap
332
+ return rgb_image, mask
333
+
334
+
335
+ def compute_mask_bbox(
336
+ mask: torch.Tensor, box_size_factor: float = 1.0
337
+ ) -> tuple[float, float, float, float]:
338
+ """
339
+ Compute a bounding box around a binary mask with optional size adjustment.
340
+
341
+ Args:
342
+ mask: A 2D binary tensor where non-zero values represent the object of interest.
343
+ box_size_factor: Factor to scale the bounding box size. Values > 1.0 create a larger box.
344
+ Default is 1.0 (tight bounding box).
345
+
346
+ Returns:
347
+ A tuple of (x1, y1, x2, y2) coordinates representing the bounding box,
348
+ where (x1, y1) is the top-left corner and (x2, y2) is the bottom-right corner.
349
+
350
+ Raises:
351
+ ValueError: If mask is not a torch.Tensor or not a 2D tensor.
352
+ """
353
+ if not isinstance(mask, torch.Tensor):
354
+ raise ValueError("Mask must be a torch.Tensor")
355
+ if not mask.dim() == 2:
356
+ raise ValueError("Mask must be a 2D tensor")
357
+ bbox_indices = torch.nonzero(mask)
358
+ if bbox_indices.numel() == 0:
359
+ # Handle empty mask case
360
+ return (0, 0, 0, 0)
361
+
362
+ y_indices = bbox_indices[:, 0]
363
+ x_indices = bbox_indices[:, 1]
364
+
365
+ min_x = torch.min(x_indices).item()
366
+ min_y = torch.min(y_indices).item()
367
+ max_x = torch.max(x_indices).item()
368
+ max_y = torch.max(y_indices).item()
369
+
370
+ bbox = (min_x, min_y, max_x, max_y)
371
+
372
+ center_x = (bbox[0] + bbox[2]) / 2
373
+ center_y = (bbox[1] + bbox[3]) / 2
374
+
375
+ bbox_w, bbox_h = bbox[2] - bbox[0], bbox[3] - bbox[1]
376
+
377
+ check_bounding_box(bbox_w, bbox_h)
378
+
379
+ size = max(bbox_w, bbox_h, 2)
380
+ size = int(size * box_size_factor)
381
+
382
+ bbox = (
383
+ int(center_x - size // 2),
384
+ int(center_y - size // 2),
385
+ int(center_x + size // 2),
386
+ int(center_y + size // 2),
387
+ )
388
+ # bbox = tuple(map(int, bbox))
389
+ return bbox
390
+
391
+
392
+ def crop_and_pad(image, bbox):
393
+ """
394
+ Crop an image using a bounding box and pad with zeros if out of bounds.
395
+
396
+ Args:
397
+ image (torch.Tensor): CxHxW image.
398
+ bbox (tuple): (x1, y1, x2, y2) bounding box.
399
+
400
+ Returns:
401
+ torch.Tensor: Cropped and zero-padded image.
402
+ """
403
+ C, H, W = image.shape
404
+ x1, y1, x2, y2 = bbox
405
+
406
+ # Ensure coordinates are integers
407
+ x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
408
+
409
+ # Compute cropping coordinates
410
+ x1_pad, y1_pad = max(0, -x1), max(0, -y1)
411
+ x2_pad, y2_pad = max(0, x2 - W), max(0, y2 - H)
412
+
413
+ # Compute valid region in the original image
414
+ x1_crop, y1_crop = max(0, x1), max(0, y1)
415
+ x2_crop, y2_crop = min(W, x2), min(H, y2)
416
+
417
+ # Extract the valid part
418
+ cropped = image[:, y1_crop:y2_crop, x1_crop:x2_crop]
419
+
420
+ # Create a zero-padded output
421
+ padded = torch.zeros((C, y2 - y1, x2 - x1), dtype=image.dtype)
422
+
423
+ # Place the cropped image into the zero-padded array
424
+ padded[
425
+ :, y1_pad : y1_pad + cropped.shape[1], x1_pad : x1_pad + cropped.shape[2]
426
+ ] = cropped
427
+
428
+ return padded
429
+
430
+
431
+ def resize_all_to_same_size(
432
+ rgb_image: torch.Tensor,
433
+ mask: torch.Tensor,
434
+ pointmap: Optional[torch.Tensor] = None,
435
+ target_size: Optional[tuple[int, int]] = None,
436
+ ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
437
+ """
438
+ Resize RGB image, mask, and pointmap to the same size.
439
+
440
+ This is crucial when pointmaps have different resolution than RGB images,
441
+ which must be done BEFORE any cropping operations.
442
+
443
+ Args:
444
+ rgb_image: RGB image tensor of shape (C, H, W)
445
+ mask: Mask tensor of shape (H, W) or (1, H, W)
446
+ pointmap: Optional pointmap tensor of shape (C_p, H_p, W_p)
447
+ target_size: Target size as (H, W). If None, uses RGB image size.
448
+
449
+ Returns:
450
+ Tuple of (resized_rgb, resized_mask, resized_pointmap)
451
+ """
452
+ squeeze_mask = (mask.dim() == 2)
453
+ if squeeze_mask:
454
+ mask = mask.unsqueeze(0)
455
+
456
+ if target_size is None:
457
+ target_size = (rgb_image.shape[1], rgb_image.shape[2]) # (H, W)
458
+
459
+ rgb_needs_resize = (rgb_image.shape[1], rgb_image.shape[2]) != target_size
460
+ if rgb_needs_resize:
461
+ rgb_image = torchvision.transforms.functional.resize(
462
+ rgb_image, target_size, interpolation=torchvision.transforms.InterpolationMode.BILINEAR
463
+ )
464
+ mask = torchvision.transforms.functional.resize(
465
+ mask, target_size, interpolation=torchvision.transforms.InterpolationMode.NEAREST
466
+ )
467
+
468
+ if pointmap is not None:
469
+ pointmap_size = (pointmap.shape[1], pointmap.shape[2])
470
+ if pointmap_size != target_size:
471
+ # Handle NaN values in pointmap during resizing
472
+ # Direct resize would propagate NaN values, so we need special handling
473
+ nan_mask = torch.isnan(pointmap).any(dim=0)
474
+ pointmap_clean = torch.where(torch.isnan(pointmap), torch.zeros_like(pointmap), pointmap)
475
+ pointmap_resized = torchvision.transforms.functional.resize(
476
+ pointmap_clean, target_size, interpolation=torchvision.transforms.InterpolationMode.BILINEAR
477
+ )
478
+
479
+ # Resize the nan mask to identify which regions should remain invalid
480
+ nan_mask_resized = torchvision.transforms.functional.resize(
481
+ nan_mask.unsqueeze(0).float(), target_size,
482
+ interpolation=torchvision.transforms.InterpolationMode.NEAREST
483
+ ).squeeze(0) > 0.5
484
+
485
+ # Restore NaN values in regions that were originally invalid
486
+ pointmap = torch.where(
487
+ nan_mask_resized.unsqueeze(0).expand_as(pointmap_resized),
488
+ torch.full_like(pointmap_resized, float('nan')),
489
+ pointmap_resized
490
+ )
491
+
492
+ if squeeze_mask:
493
+ mask = mask.squeeze(0)
494
+
495
+ if pointmap is not None:
496
+ return rgb_image, mask, pointmap
497
+ return rgb_image, mask
498
+
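A short sketch of the NaN-aware resizing above (same hedged `sam3d_objects` import path as before): a lower-resolution pointmap is brought to the RGB resolution, and pixels that were NaN stay NaN instead of bleeding into neighbours through bilinear interpolation.

```python
import torch
from sam3d_objects.data.dataset.tdfy.img_and_mask_transforms import resize_all_to_same_size

rgb = torch.rand(3, 128, 128)
mask = torch.ones(128, 128)
pointmap = torch.rand(3, 64, 64)
pointmap[:, 10:20, 10:20] = float("nan")   # simulate an invalid depth region

rgb_out, mask_out, pm_out = resize_all_to_same_size(rgb, mask, pointmap)
assert pm_out.shape == (3, 128, 128)
# NaNs are restored only where the original pointmap was invalid.
assert torch.isnan(pm_out).any() and torch.isfinite(pm_out).any()
```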
499
+
500
+ SSINormalizedPointmap = namedtuple("SSINormalizedPointmap", ["pointmap", "scale", "shift"])
501
+ class SSIPointmapNormalizer:
502
+
503
+ def normalize(self, pointmap: torch.Tensor, mask: torch.Tensor,
504
+ scale: Optional[torch.Tensor] = None, shift: Optional[torch.Tensor] = None,
505
+ ) -> SSINormalizedPointmap:
506
+ if scale is None or shift is None:
507
+ normalized_pointmap, scale, shift = normalize_pointmap_ssi(pointmap)
508
+ else:
509
+ assert scale.shape == (3,) and shift.shape == (3,), "scale and shift must be in (3,) format"
510
+ normalized_pointmap = _apply_metric_to_ssi(pointmap, scale, shift)
511
+ return SSINormalizedPointmap(normalized_pointmap, scale, shift)
512
+
513
+ def denormalize(self, pointmap: torch.Tensor, scale: torch.Tensor, shift: torch.Tensor) -> torch.Tensor:
514
+ pointmap = _apply_metric_to_ssi(pointmap, scale, shift, apply_inverse=True)
515
+ return pointmap
516
+
517
+
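A round-trip sketch for the base normalizer (assuming the package and its pytorch3d dependency are installed): `denormalize(normalize(x))` should recover the input up to floating-point error.

```python
import torch
from sam3d_objects.data.dataset.tdfy.img_and_mask_transforms import SSIPointmapNormalizer

normalizer = SSIPointmapNormalizer()
pointmap = torch.rand(3, 32, 32) + 1.0     # (3, H, W), depth-like positive values
mask = torch.ones(1, 32, 32)               # unused by the base class, but required

out = normalizer.normalize(pointmap, mask)  # SSINormalizedPointmap(pointmap, scale, shift)
restored = normalizer.denormalize(out.pointmap, out.scale, out.shift)
assert torch.allclose(restored, pointmap, atol=1e-4)
```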
518
+
519
+ class ObjectCentricSSI(SSIPointmapNormalizer):
520
+ def __init__(self,
521
+ use_scene_scale: bool = True,  # may also be the string "OBJECT_NORM_MEDIAN"; see _compute_scale_and_shift
522
+ quantile_drop_threshold: float = 0.1,
523
+ clip_beyond_scale: Optional[float] = None,
524
+ # scale_factor: float = 3.8076, # e^(1.337); empirical mean of R3+Artist train
525
+ scale_factor: float = 1.0,
526
+ allow_scale_and_shift_override: bool = False,
527
+ raise_on_no_valid_points: bool = False,
528
+ ):
529
+ self.use_scene_scale = use_scene_scale
530
+ self.quantile_drop_threshold = quantile_drop_threshold
531
+ self.clip_beyond_scale = clip_beyond_scale
532
+ self.scale_factor = scale_factor
533
+ self.allow_scale_and_shift_override = allow_scale_and_shift_override
534
+ self.raise_on_no_valid_points = raise_on_no_valid_points
535
+
536
+ def _compute_scale_and_shift(self, pointmap: torch.Tensor, mask: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
537
+ pointmap_size = (pointmap.shape[1], pointmap.shape[2])
538
+
539
+
540
+ mask_resized = torchvision.transforms.functional.resize(
541
+ mask, pointmap_size,
542
+ interpolation=torchvision.transforms.InterpolationMode.NEAREST
543
+ ).squeeze(0)
544
+
545
+ pointmap_flat = pointmap.reshape(3, -1)
546
+ # Get valid points from the mask
547
+ mask_bool = mask_resized.reshape(-1) > 0.5
548
+ mask_points = pointmap_flat[:, mask_bool]
549
+
550
+ if mask_points.isfinite().max() == 0:
551
+ if self.raise_on_no_valid_points:
552
+ raise ValueError(f"No valid points found in mask")
553
+ logger.warning(f"No valid points found in mask; setting scale to {self.scale_factor} and shift to 0")
554
+ return torch.ones_like(pointmap_flat[:,0]) * self.scale_factor, torch.zeros_like(pointmap_flat[:,0])
555
+
556
+ # Compute median for shift
557
+ shift = mask_points.nanmedian(dim=-1).values
558
+ # logger.info(f"{pointmap.shape=} {mask_resized.shape=} {shift.shape=}")
559
+
560
+
561
+ if self.use_scene_scale == True:  # explicit comparison: use_scene_scale may be a bool or a string
562
+ # Normalize by the scene scale
563
+ points_centered = pointmap_flat - shift.unsqueeze(-1)
564
+ max_dims = points_centered.abs().max(dim=0).values
565
+ scale = max_dims.nanmedian(dim=-1).values
566
+ elif self.use_scene_scale == False:
567
+ # Normalize by the object scale
568
+ shifted_mask_points = mask_points - shift.unsqueeze(-1)
569
+ norm = shifted_mask_points.norm(dim=0)
570
+ quantiles = torch.nanquantile(norm,
571
+ torch.tensor([self.quantile_drop_threshold, 1. - self.quantile_drop_threshold],
572
+ device=shifted_mask_points.device),
573
+ dim=-1)
574
+ scale = (quantiles[1] - quantiles[0]).max(dim=-1).values * 2.0
575
+ elif self.use_scene_scale.upper() == "OBJECT_NORM_MEDIAN":
576
+ # Normalize by the object scale
577
+ shifted_mask_points = mask_points - shift.unsqueeze(-1)
578
+ norm = shifted_mask_points.norm(dim=0)
579
+ scale = norm.nanmedian(dim=-1).values
580
+ else:
581
+ raise ValueError(f"Invalid use_scene_scale: {self.use_scene_scale}")
582
+ scale = scale.expand_as(shift) # per-dim scaling
583
+ scale = scale * self.scale_factor
584
+ return scale, shift
585
+
586
+ def normalize(self, pointmap: torch.Tensor, mask: torch.Tensor,
587
+ scale: Optional[torch.Tensor] = None, shift: Optional[torch.Tensor] = None,
588
+ ) -> SSINormalizedPointmap:
589
+ # 1. Resize the mask to the pointmap resolution (nearest interpolation).
590
+ # 2. Select the masked points: pointmap[:, mask > 0.5].
591
+ # 3. shift = per-axis median of the masked points.
592
+ # 4. scale = scene/object extent from _compute_scale_and_shift; falls back to
593
+ #    scale_factor when the mask contains no valid points.
594
+ assert pointmap.shape[0] == 3, "pointmap must be in (3, H, W) format"
595
+ pointmap_size = (pointmap.shape[1], pointmap.shape[2])
596
+
597
+ _scale, _shift = self._compute_scale_and_shift(pointmap, mask)
598
+ if scale is not None and self.allow_scale_and_shift_override:
599
+ _scale = scale
600
+ if shift is not None and self.allow_scale_and_shift_override:
601
+ _shift = shift
602
+ return_scale, return_shift = _scale, _shift
603
+
604
+ # Apply normalization
605
+ pointmap_normalized = _apply_metric_to_ssi(pointmap, return_scale, return_shift)
606
+
607
+ if self.clip_beyond_scale is not None and self.clip_beyond_scale > 0:
608
+ new_norm = pointmap_normalized.norm(dim=0)
609
+ pointmap_normalized = torch.where(
610
+ new_norm > self.clip_beyond_scale,
611
+ torch.full_like(pointmap_normalized, float('nan')),
612
+ pointmap_normalized
613
+ )
614
+
615
+ return SSINormalizedPointmap(pointmap_normalized, return_scale, return_shift)
616
+
617
+
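A sketch of the object-centric variant (same assumptions as above): with `use_scene_scale=True` the shift is the per-axis median of the masked points and the scale comes from the scene extent.

```python
import torch
from sam3d_objects.data.dataset.tdfy.img_and_mask_transforms import ObjectCentricSSI

normalizer = ObjectCentricSSI(use_scene_scale=True)
pointmap = torch.rand(3, 32, 32) * 2.0 + 1.0
mask = torch.zeros(1, 32, 32)
mask[:, 8:24, 8:24] = 1.0                   # object occupies the central region

out = normalizer.normalize(pointmap, mask)  # SSINormalizedPointmap(pointmap, scale, shift)
obj_pts = pointmap[:, 8:24, 8:24].reshape(3, -1)
assert torch.allclose(out.shift, obj_pts.median(dim=-1).values)
```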
618
+ class ObjectApparentSizeSSI(SSIPointmapNormalizer):
619
+ def __init__(self,
620
+ clip_beyond_scale: Optional[float] = None,
621
+ use_scene_scale: bool = True,
622
+ scale_factor: float = 1.0, # e^(1.337); empirical mean of R3+Artist train
623
+ ):
624
+ self.clip_beyond_scale = clip_beyond_scale
625
+ self.use_scene_scale = use_scene_scale
626
+ self.scale_factor = scale_factor
627
+
628
+ def _get_scale_and_shift(self, pointmap: torch.Tensor, mask: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
629
+ pointmap_size = (pointmap.shape[1], pointmap.shape[2])
630
+ pointmap_flat = pointmap.reshape(3, -1)
631
+
632
+ if not self.use_scene_scale:
633
+ # Get valid points from the mask
634
+ mask_resized = torchvision.transforms.functional.resize(
635
+ mask, pointmap_size,
636
+ interpolation=torchvision.transforms.InterpolationMode.NEAREST
637
+ ).squeeze(0)
638
+ mask_bool = mask_resized.reshape(-1) > 0.5
639
+ pointmap_flat = pointmap_flat[:, mask_bool]
640
+
641
+ # Median z-distance
642
+ median_z = pointmap_flat[-1, ...].nanmedian().unsqueeze(0)
643
+ scale = median_z.expand(3) * self.scale_factor
644
+ shift = torch.zeros_like(scale)
645
+ # logger.info(f'median z = {median_z}')
646
+ return scale, shift
647
+
648
+ def normalize(self,
649
+ pointmap: torch.Tensor,
650
+ mask: torch.Tensor,
651
+ scale: Optional[torch.Tensor] = None,
652
+ shift: Optional[torch.Tensor] = None,
653
+ ) -> torch.Tensor:
654
+ assert pointmap.shape[0] == 3, "pointmap must be in (3, H, W) format"
655
+ pointmap_size = (pointmap.shape[1], pointmap.shape[2])
656
+
657
+ if scale is None or shift is None:
658
+ scale, shift = self._get_scale_and_shift(pointmap, mask)
659
+ else:
660
+ assert scale.shape == (3,) and shift.shape == (3,), "scale and shift must be in (3,) format"
661
+
662
+ # Apply normalization and clip
663
+ pointmap_normalized = _apply_metric_to_ssi(pointmap, scale, shift)
664
+ # logger.info(f"{pointmap_normalized.shape=}")
665
+
666
+ if self.clip_beyond_scale is not None and self.clip_beyond_scale > 0:
667
+ pointmap_normalized = torch.where(
668
+ pointmap_normalized[-1, ...] > self.clip_beyond_scale,
669
+ torch.full_like(pointmap_normalized, float('nan')),
670
+ pointmap_normalized
671
+ )
672
+
673
+ # return pointmap_normalized, scale, shift
674
+ return SSINormalizedPointmap(pointmap_normalized, scale, shift)
675
+
676
+
677
+ class NormalizedDisparitySpaceSSI(SSIPointmapNormalizer):
678
+ def __init__(self,
679
+ clip_beyond_scale: Optional[float] = None,
680
+ use_scene_scale: bool = True,
681
+ log_disparity_shift: float = 0.0,
682
+ ):
683
+ self.clip_beyond_scale = clip_beyond_scale
684
+ self.use_scene_scale = use_scene_scale
685
+ self.point_remapper = PointRemapper(remap_type="exp_disparity")
686
+ self.log_disparity_shift = log_disparity_shift
687
+
688
+ def normalize(self, pointmap: torch.Tensor, mask: torch.Tensor,
689
+ scale: Optional[torch.Tensor] = None, shift: Optional[torch.Tensor] = None,
690
+ ) -> SSINormalizedPointmap:
691
+ assert pointmap.shape[0] == 3, "pointmap must be in (3, H, W) format"
692
+
693
+
694
+ disparity_space_pointmap = self.point_remapper.forward(pointmap.permute(1, 2, 0)).permute(2, 0, 1)
695
+ if scale is None or shift is None:
696
+ scale, shift = self._get_scale_and_shift(disparity_space_pointmap, mask)
697
+ else:
698
+ assert scale.shape == (3,) and shift.shape == (3,), "scale and shift must be in (3,) format"
699
+
700
+ # pointmap_normalized = pointmap.clone().detach()
701
+ pointmap_normalized = _apply_metric_to_ssi(disparity_space_pointmap, scale, shift)
702
+ # logger.info(f"{pointmap_normalized.shape=}")
703
+
704
+ if self.clip_beyond_scale is not None and self.clip_beyond_scale > 0:
705
+ pointmap_normalized = torch.where(
706
+ pointmap_normalized[2, ...].abs() > self.clip_beyond_scale,
707
+ torch.full_like(pointmap_normalized, float('nan')),
708
+ pointmap_normalized
709
+ )
710
+
711
+ # return pointmap_normalized, scale, shift
712
+ return SSINormalizedPointmap(pointmap_normalized, scale, shift)
713
+
714
+ def denormalize(self, pointmap: torch.Tensor, scale: torch.Tensor, shift: torch.Tensor) -> torch.Tensor:
715
+ pointmap = _apply_metric_to_ssi(pointmap, scale, shift, apply_inverse=True)
716
+ pointmap = self.point_remapper.inverse(pointmap.permute(1, 2, 0)).permute(2, 0, 1)
717
+ return pointmap
718
+
719
+ def _get_scale_and_shift(self, pointmap: torch.Tensor, mask: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
720
+ pointmap_size = (pointmap.shape[1], pointmap.shape[2])
721
+ mask_resized = torchvision.transforms.functional.resize(
722
+ mask, pointmap_size,
723
+ interpolation=torchvision.transforms.InterpolationMode.NEAREST
724
+ ).squeeze(0)
725
+
726
+ pointmap_flat = pointmap.reshape(3, -1)
727
+ if self.use_scene_scale:
728
+ median_z = pointmap_flat[-1, ...].nanmedian().unsqueeze(0)
729
+ shift = torch.zeros_like(median_z.expand(3))
730
+ shift[-1, ...] = median_z[0] + self.log_disparity_shift
731
+ else:
732
+ # Get valid points from the mask (shift, x/z, y/z, log(z))
733
+ mask_bool = mask_resized.reshape(-1) > 0.5
734
+ pointmap_flat = pointmap_flat[:, mask_bool]
735
+ shift = pointmap_flat.nanmedian(dim=-1).values
736
+
737
+ scale = torch.ones_like(shift)
738
+ # logger.info(f'median z = {median_z}')
739
+ return scale, shift
740
+
741
+ def normalize_pointmap_ssi(pointmap: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
742
+ """
743
+ Normalize pointmap using Scale-Shift Invariant (SSI) normalization.
744
+
745
+ Args:
746
+ pointmap: Pointmap tensor of shape (H, W, 3) or (3, H, W)
747
+
748
+ Returns:
749
+ Tuple of (normalized_pointmap, scale, shift)
750
+ """
751
+ from sam3d_objects.data.dataset.tdfy.pose_target import ScaleShiftInvariant
752
+
753
+ # Convert to (H, W, 3) if needed for get_scale_and_shift
754
+ if pointmap.shape[0] == 3:
755
+ pointmap_hw3 = pointmap.permute(1, 2, 0)
756
+ original_format = 'chw'
757
+ else:
758
+ pointmap_hw3 = pointmap
759
+ original_format = 'hwc'
760
+
761
+ # Get scale and shift using existing method
762
+ scale, shift = ScaleShiftInvariant.get_scale_and_shift(pointmap_hw3)
763
+
764
+ pointmap_normalized = _apply_metric_to_ssi(pointmap, scale, shift)
765
+ return pointmap_normalized, scale, shift
766
+
767
+ def _apply_metric_to_ssi(pointmap: torch.Tensor, scale: torch.Tensor, shift: torch.Tensor, apply_inverse: bool = False) -> torch.Tensor:
768
+ """
769
+ Apply the metric -> SSI scale/shift transform to a pointmap (or its inverse when apply_inverse=True).
770
+
771
+ Args:
772
+ pointmap: Pointmap tensor of shape (H, W, 3) or (3, H, W); scale, shift: per-axis tensors of shape (3,)
773
+
774
+ Returns:
775
+ Transformed pointmap in the same layout as the input.
776
+ """
777
+ from sam3d_objects.data.dataset.tdfy.pose_target import ScaleShiftInvariant
778
+
779
+ # Convert to (H, W, 3) if needed for get_scale_and_shift
780
+ if pointmap.shape[0] == 3:
781
+ pointmap_hw3 = pointmap.permute(1, 2, 0)
782
+ original_format = 'chw'
783
+ else:
784
+ pointmap_hw3 = pointmap
785
+ original_format = 'hwc'
786
+
787
+ # Apply normalization
788
+ ssi_to_metric = ScaleShiftInvariant.ssi_to_metric(scale, shift)
789
+ metric_to_ssi = ssi_to_metric.inverse()
790
+ transform_to_apply = metric_to_ssi
791
+
792
+ if apply_inverse:
793
+ transform_to_apply = ssi_to_metric
794
+
795
+ pointmap_flat = pointmap_hw3.reshape(-1, 3)
796
+ pointmap_normalized = transform_to_apply.transform_points(pointmap_flat)
797
+
798
+ # Reshape back to original format
799
+ if original_format == 'chw':
800
+ pointmap_normalized = pointmap_normalized.reshape(pointmap.shape[1], pointmap.shape[2], 3).permute(2, 0, 1)
801
+ else:
802
+ pointmap_normalized = pointmap_normalized.reshape(pointmap_hw3.shape)
803
+
804
+ return pointmap_normalized
805
+
806
+
807
+ def perturb_mask_translation(
808
+ image: torch.Tensor,
809
+ mask: torch.Tensor,
810
+ max_px_delta: int = 5,
811
+ ):
812
+ """
813
+ Applies data augmentation to the mask by randomly translating the mask.
814
+
815
+ Args:
816
+ image: (C, H, W) float32 [0, 1] tensor.
817
+ mask: (1, H, W) float32 [0, 1] tensor.
818
+ max_px_delta: The maximum number of pixels we will randomly shift by in each 2D direction.
819
+ """
820
+ dx = random.randint(-max_px_delta, max_px_delta)
821
+ dy = random.randint(-max_px_delta, max_px_delta)
822
+
823
+ mask = mask.squeeze(0)
824
+ mask = torch.roll(mask, shifts=(dy, dx), dims=(0, 1))
825
+
826
+ # Zero out wrapped regions
827
+ if dy > 0:
828
+ mask[:dy, :] = 0
829
+ elif dy < 0:
830
+ mask[dy:, :] = 0
831
+ if dx > 0:
832
+ mask[:, :dx] = 0
833
+ elif dx < 0:
834
+ mask[:, dx:] = 0
835
+
836
+ mask = mask.unsqueeze(0)
837
+ return image, mask
838
+
839
+
840
+ def perturb_mask_boundary(
841
+ image: torch.Tensor,
842
+ mask: torch.Tensor,
843
+ kernel_range: tuple[int, int] = (2, 5),
844
+ p_erode: float = 0.1,
845
+ p_dilate: float = 0.8,
846
+ **kwargs,
847
+ ):
848
+ """
849
+ Applies data augmentation to the mask by randomly eroding or dilating the mask.
850
+
851
+ Args:
852
+ image: (C, H, W) float32 [0, 1] tensor.
853
+ mask: (1, H, W) float32 [0, 1] tensor.
854
+ kernel_range: Range of kernel sizes to sample from.
855
+ p_erode: Probability of erosion.
856
+ p_dilate: Probability of dilation.
857
+ kwargs: Kwargs for the cv2 erode/dilate function.
858
+ """
859
+ import cv2
860
+
861
+ C, H, W = image.shape
862
+ assert mask.shape == (1, H, W)
863
+ assert mask.dtype == torch.float32
864
+ assert torch.all((mask == 0) | (mask == 1)), "Mask must be binary (0 or 1)"
865
+
866
+ p_none = 1.0 - p_erode - p_dilate
867
+ assert 0 <= p_none <= 1, "p_erode + p_dilate must be between 0 and 1."
868
+
869
+ # Sample operation.
870
+ op = random.choices(["erode", "dilate", "none"], weights=[p_erode, p_dilate, p_none], k=1)[0]
871
+
872
+ if op == "none":
873
+ pass
874
+ else:
875
+ # Sample kernel size
876
+ ksize = random.randint(*kernel_range)
877
+ kernel = np.ones((ksize, ksize), np.uint8)
878
+
879
+ mask = mask.squeeze().cpu().numpy().astype(np.uint8) # (H, W)
880
+
881
+ if op == "erode":
882
+ mask = cv2.erode(mask, kernel, **kwargs)
883
+ elif op == "dilate":
884
+ mask = cv2.dilate(mask, kernel, **kwargs)
885
+ else:
886
+ raise NotImplementedError
887
+
888
+ mask = torch.from_numpy(mask).float()[None] # (1, H, W)
889
+
890
+ return image, mask
891
+
892
+
893
+ def resolution_blur(
894
+ image: torch.Tensor,
895
+ mask: torch.Tensor,
896
+ scale_range=(0.05, 0.95),
897
+ interpolation_down=tv_transforms.InterpolationMode.BICUBIC,
898
+ interpolation_up=tv_transforms.InterpolationMode.BICUBIC,
899
+ ):
900
+ """
901
+ Blur the input image by applying upsample(downsample(x)).
902
+
903
+ Args:
904
+ image (torch.Tensor): Image tensor of shape (C, H, W), float32, with values in [0, 1].
905
+ mask (torch.Tensor): Mask tensor of shape (1, H, W), float32, with values in [0, 1]. The mask is returned unchanged.
906
+ scale_range: Tuple of (min_scale, max_scale) for downsampling.
907
+ interpolation_down: Interpolation mode for downsampling.
908
+ interpolation_up: Interpolation mode for upsampling.
909
+ """
910
+ C, H, W = image.shape
911
+ scale = random.uniform(*scale_range)
912
+ new_H, new_W = max(1, int(H * scale)), max(1, int(W * scale))
913
+
914
+ # Downsample
915
+ image = TF.resize(image, size=[new_H, new_W], interpolation=interpolation_down)
916
+
917
+ # Upsample back to original size
918
+ image = TF.resize(image, size=[H, W], interpolation=interpolation_up)
919
+
920
+ return image, mask
921
+
922
+
923
+ def gaussian_blur(
924
+ image: torch.Tensor,
925
+ mask: torch.Tensor,
926
+ kernel_range: tuple[int, int] = (3, 15),
927
+ sigma_range: tuple[float, float] = (0.1, 4.0),
928
+ ):
929
+ """
930
+ Apply gaussian blur to the input image.
931
+
932
+ Args:
933
+ image (torch.Tensor): Image tensor of shape (C, H, W), float32, with values in [0, 1].
934
+ mask (torch.Tensor): Mask tensor of shape (1, H, W), float32, with values in [0, 1]. The mask is returned unchanged.
935
+ kernel_range (tuple): Range of odd kernel sizes to sample from for the Gaussian blur (min, max).
936
+ sigma_range (tuple): Range of sigma values (standard deviation) to sample from for the Gaussian kernel (min, max).
937
+ """
938
+ kernel_size = random.choice([k for k in range(kernel_range[0], kernel_range[1]+1) if k % 2 == 1])
939
+ sigma = random.uniform(*sigma_range)
940
+ pad = kernel_size // 2
941
+
942
+ # Step 1: Pad the image
943
+ image = F.pad(image.unsqueeze(0), (pad, pad, pad, pad), mode='replicate')
944
+
945
+ # Step 2: Apply gaussian blur
946
+ image = TF.gaussian_blur(image, kernel_size=[kernel_size, kernel_size], sigma=sigma)
947
+
948
+ # Step 3: Unpad to get back to original size
949
+ image = image[:, :, pad:-pad, pad:-pad]
950
+
951
+ return image.squeeze(0), mask
952
+
953
+
954
+ def apply_blur_augmentation(
955
+ image: torch.Tensor,
956
+ mask: torch.Tensor,
957
+ p_resolution: float = 0.33,
958
+ p_gaussian: float = 0.33,
959
+ gaussian_kwargs: dict = None,
960
+ resolution_kwargs: dict = None,
961
+ ):
962
+ """Apply blur augmentation with configurable parameters"""
963
+
964
+ # Handle None defaults BEFORE unpacking
965
+ if gaussian_kwargs is None:
966
+ gaussian_kwargs = {}
967
+ if resolution_kwargs is None:
968
+ resolution_kwargs = {}
969
+
970
+ p_none = 1.0 - p_gaussian - p_resolution
971
+ assert 0 <= p_none <= 1, "p_gaussian + p_resolution must be between 0 and 1."
972
+
973
+ operation = random.choices(
974
+ ["gaussian", "resolution", "none"],
975
+ weights=[p_gaussian, p_resolution, p_none],
976
+ k=1
977
+ )[0]
978
+
979
+ if operation == "gaussian":
980
+ return gaussian_blur(image, mask, **gaussian_kwargs)
981
+ elif operation == "resolution":
982
+ return resolution_blur(image, mask, **resolution_kwargs)
983
+ elif operation == "none":
984
+ return image, mask
985
+ else:
986
+ raise NotImplementedError
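One possible way the mask and image augmentations above could be chained at data-loading time (a sketch only, not taken from the repository's training code; it assumes the package plus torchvision and opencv-python are installed):

```python
import torch
from sam3d_objects.data.dataset.tdfy.img_and_mask_transforms import (
    apply_blur_augmentation,
    perturb_mask_boundary,
    perturb_mask_translation,
)

image = torch.rand(3, 256, 256)                  # (C, H, W) in [0, 1]
mask = (torch.rand(1, 256, 256) > 0.5).float()   # binary (1, H, W) mask

image, mask = perturb_mask_translation(image, mask, max_px_delta=5)
image, mask = perturb_mask_boundary(image, mask, kernel_range=(2, 5))
image, mask = apply_blur_augmentation(image, mask, p_gaussian=0.33, p_resolution=0.33)
assert image.shape == (3, 256, 256) and mask.shape == (1, 256, 256)
```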
thirdparty/sam3d/sam3d/sam3d_objects/data/dataset/tdfy/img_processing.py ADDED
@@ -0,0 +1,189 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import math
3
+
4
+ import random
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+
9
+ from torchvision import transforms
10
+ from torchvision.transforms import functional as tv_F
11
+
12
+
13
+ class RandomResizedCrop(transforms.RandomResizedCrop):
14
+ """
15
+ RandomResizedCrop for matching TF/TPU implementation: no for-loop is used.
16
+ This may lead to results that differ from torchvision's version.
17
+ Following BYOL's TF code:
18
+ https://github.com/deepmind/deepmind-research/blob/master/byol/utils/dataset.py#L206
19
+ """
20
+
21
+ @staticmethod
22
+ def get_params(img, scale, ratio):
23
+ width, height = tv_F._get_image_size(img)
24
+ area = height * width
25
+
26
+ target_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item()
27
+ log_ratio = torch.log(torch.tensor(ratio))
28
+ aspect_ratio = torch.exp(
29
+ torch.empty(1).uniform_(log_ratio[0], log_ratio[1])
30
+ ).item()
31
+
32
+ w = int(round(math.sqrt(target_area * aspect_ratio)))
33
+ h = int(round(math.sqrt(target_area / aspect_ratio)))
34
+
35
+ w = min(w, width)
36
+ h = min(h, height)
37
+
38
+ i = torch.randint(0, height - h + 1, size=(1,)).item()
39
+ j = torch.randint(0, width - w + 1, size=(1,)).item()
40
+
41
+ return i, j, h, w
42
+
43
+
44
+ # following PT3D CO3D data to pad image
45
+ def pad_to_square(image, value=0):
46
+ _, _, h, w = image.shape # Assuming image is in (B, C, H, W) format
47
+ if h == w:
48
+ return image # The image is already square
49
+
50
+ # Calculate the padding
51
+ diff = abs(h - w)
52
+ pad2 = diff
53
+
54
+ # Pad the image to make it square
55
+ if h > w:
56
+ padding = (0, pad2, 0, 0) # Pad width (left, right, top, bottom)
57
+ else:
58
+ padding = (0, 0, 0, pad2) # Pad height
59
+ # Apply padding
60
+ padded_image = torch.nn.functional.pad(image, padding, mode="constant", value=value)
61
+ return padded_image
62
+
63
+
64
+ def preprocess_img(
65
+ x,
66
+ mask=None,
67
+ img_target_shape=224,
68
+ mask_target_shape=256,
69
+ normalize=False,
70
+ ):
71
+ if x.shape[-2] != x.shape[-1]:  # compare H and W of the (B, C, H, W) input
72
+ x = pad_to_square(x)
73
+ if mask is not None and mask.shape[-2] != mask.shape[-1]:
74
+ mask = pad_to_square(mask)
75
+ if x.shape[2] != img_target_shape:
76
+ x = F.interpolate(
77
+ x,
78
+ size=(img_target_shape, img_target_shape),
79
+ # scale_factor=float(img_target_shape)/x.shape[2],
80
+ mode="bilinear",
81
+ )
82
+ if mask is not None and mask.shape[2] != mask_target_shape:
83
+ if mask is not None:
84
+ mask = F.interpolate(
85
+ mask,
86
+ size=(mask_target_shape, mask_target_shape),
87
+ # scale_factor=float(mask_target_shape)/mask.shape[2],
88
+ mode="nearest",
89
+ )
90
+ if normalize:
91
+ imgs_normed = resnet_img_normalization(x)
92
+ else:
93
+ imgs_normed = x
94
+ return imgs_normed, mask
95
+
96
+
97
+ def resnet_img_normalization(x):
98
+ resnet_mean = torch.tensor([0.485, 0.456, 0.406], device=x.device).reshape(
99
+ (3, 1, 1)
100
+ )
101
+ resnet_std = torch.tensor([0.229, 0.224, 0.225], device=x.device).reshape((3, 1, 1))
102
+ if x.ndim == 4:
103
+ resnet_mean = resnet_mean[None]
104
+ resnet_std = resnet_std[None]
105
+ x = (x - resnet_mean) / resnet_std
106
+ return x
107
+
108
+
109
+ # pad image to be centered for unprojecting depth
110
+ def pad_to_square_centered(image, value=0, pointmap=None):
111
+ h, w = image.shape[-2], image.shape[-1] # Assuming image is in (B, C, H, W) format
112
+ if h == w:
113
+ if pointmap is not None:
114
+ return image, pointmap
115
+ return image # The image is already square
116
+
117
+ # Calculate the padding
118
+ diff = abs(h - w)
119
+ pad1 = diff // 2
120
+ pad2 = diff - pad1
121
+
122
+ # Pad the image to make it square
123
+ if h > w:
124
+ padding = (pad1, pad2, 0, 0) # Pad width (left, right, top, bottom)
125
+ else:
126
+ padding = (0, 0, pad1, pad2) # Pad height
127
+ # Apply padding to image
128
+ padded_image = F.pad(image, padding, mode="constant", value=value)
129
+
130
+ # Apply padding to pointmap if provided
131
+ if pointmap is not None:
132
+ # Pad pointmap using torch functional with NaN fill value
133
+ padded_pointmap = F.pad(pointmap, padding, mode="constant", value=float("nan"))
134
+
135
+ return padded_image, padded_pointmap
136
+ return padded_image
137
+
138
+
139
+ def crop_img_to_obj(mask, context_size):
140
+ nonzeros = torch.nonzero(mask)
141
+ if len(nonzeros) > 0:
142
+ r_max, c_max = nonzeros.max(dim=0)[0]
143
+ r_min, c_min = nonzeros.min(dim=0)[0]
144
+ box_h = max(1, r_max - r_min)
145
+ box_w = max(1, c_max - c_min)
146
+ left = max(0, c_min - int(box_w * context_size))
147
+ right = min(mask.shape[-1], c_max + int(box_w * context_size))
148
+ top = max(0, r_min - int(box_h * context_size))
149
+ bot = min(mask.shape[-2], r_max + int(box_h * context_size))
150
+ return left, right, top, bot
151
+ return None, None, None, None
152
+
153
+
154
+ def random_pad(img, mask=None, max_ratio=0.0, pointmap=None):
155
+ max_size = int(max(img.shape) * max_ratio)
156
+ padding = tuple([random.randint(0, max_size) for _ in range(4)])
157
+ img = F.pad(img, padding)
158
+ if mask is not None:
159
+ mask = F.pad(mask, padding)
160
+
161
+ if pointmap is not None:
162
+ pointmap = F.pad(pointmap, padding, mode="constant", value=float("nan"))
163
+ return img, mask, pointmap
164
+ return img, mask
165
+
166
+
167
+ def get_img_color_augmentation(
168
+ color_jit_prob=0.5,
169
+ gaussian_blur_prob=0.1,
170
+ ):
171
+ transform = transforms.Compose(
172
+ [
173
+ # (a) Random Color Jitter
174
+ transforms.RandomApply(
175
+ [
176
+ transforms.ColorJitter(
177
+ brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1
178
+ )
179
+ ],
180
+ p=color_jit_prob,
181
+ ),
182
+ # (b) Randomly apply GaussianBlur
183
+ transforms.RandomApply(
184
+ [transforms.GaussianBlur(kernel_size=3, sigma=(0.1, 2.0))],
185
+ p=gaussian_blur_prob,
186
+ ),
187
+ ]
188
+ )
189
+ return transform
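A usage sketch for `preprocess_img` (assuming the `sam3d_objects.data.dataset.tdfy.img_processing` module path from this diff is importable): batched images and masks are padded to square, resized to their respective target resolutions, and optionally ResNet-normalized.

```python
import torch
from sam3d_objects.data.dataset.tdfy.img_processing import preprocess_img

imgs = torch.rand(2, 3, 480, 640)    # (B, C, H, W), non-square
masks = torch.rand(2, 1, 480, 640)

imgs_out, masks_out = preprocess_img(
    imgs, masks, img_target_shape=224, mask_target_shape=256, normalize=True
)
assert imgs_out.shape == (2, 3, 224, 224)
assert masks_out.shape == (2, 1, 256, 256)
```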
thirdparty/sam3d/sam3d/sam3d_objects/data/dataset/tdfy/pose_target.py ADDED
@@ -0,0 +1,784 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import torch
3
+ from typing import Dict, Optional, Tuple, Any
4
+ from dataclasses import dataclass, asdict, field
5
+ from loguru import logger
6
+
7
+ from sam3d_objects.data.utils import expand_as_right, tree_tensor_map
8
+ from sam3d_objects.data.dataset.tdfy.transforms_3d import compose_transform, decompose_transform
9
+ from pytorch3d.transforms import Transform3d, quaternion_to_matrix, matrix_to_quaternion
10
+
11
+
12
+ @dataclass
13
+ class InstancePose:
14
+ """
15
+ Stores the pose of an object.
16
+ Also, stores some information about the scene that was used to normalize the pose.
17
+ """
18
+
19
+ instance_scale_l2c: torch.Tensor
20
+ instance_position_l2c: torch.Tensor
21
+ instance_quaternion_l2c: torch.Tensor
22
+ scene_scale: torch.Tensor
23
+ scene_shift: torch.Tensor
24
+
25
+ @classmethod
26
+ def _broadcast_postcompose(
27
+ cls,
28
+ scale: torch.Tensor,
29
+ rotation: torch.Tensor,
30
+ translation: torch.Tensor,
31
+ transform_to_postcompose: Transform3d,
32
+ ) -> Transform3d:
33
+ """
34
+ Assumes scale, rotation, translation are of shape:
35
+ B, K, C
36
+ ---
37
+ B: batch size
38
+ K: number of objects
39
+ C: number of channels
40
+
41
+ Takes a transform where
42
+ get_matrix() has shape (B, 3, 3)
43
+
44
+ Returns pose.compose(transform_to_postcompose)
45
+ """
46
+ scale_c = scale.shape[-1]
47
+ ndim_orig = scale.ndim
48
+ if ndim_orig == 3:
49
+ b, k, _ = scale.shape
50
+ elif ndim_orig == 2:
51
+ b = scale.shape[0]
52
+ k = 1
53
+ elif ndim_orig == 1:
54
+ b = 1
55
+ k = 1
56
+ else:
57
+ raise ValueError(f"Invalid scale shape: {scale.shape}")
58
+
59
+ # Create transform of shape (B * K)
60
+ wide = {"scale": scale, "rotation": rotation, "translation": translation}
61
+ shapes_orig = {k: v.shape for k, v in wide.items()}
62
+ long = tree_tensor_map(lambda x: x.reshape(b * k, x.shape[-1]), wide)
63
+ long["rotation"] = quaternion_to_matrix(long["rotation"])
64
+ if scale_c == 1:
65
+ long["scale"] = long["scale"].expand(b * k, 3)
66
+
67
+ composed = compose_transform(**long)
68
+
69
+ # Apply transform to shape (B * K)
70
+ pc_transform = transform_to_postcompose.get_matrix()
71
+ pc_transform = pc_transform.repeat(k, 1, 1)
72
+ stacked_pc_transform = Transform3d(matrix=pc_transform)
73
+ assert stacked_pc_transform.get_matrix().shape == composed.get_matrix().shape
74
+ postcomposed = composed.compose(stacked_pc_transform)
75
+
76
+ # Decompose transform to shape (B, K, C)
77
+ scale, rotation, translation = decompose_transform(postcomposed)
78
+ rotation = matrix_to_quaternion(rotation)
79
+ pc_long = {"scale": scale, "rotation": rotation, "translation": translation}
80
+ pc_wide = tree_tensor_map(lambda x: x.reshape(b, k, x.shape[-1]), pc_long)
81
+ if scale_c == 1:
82
+ pc_wide["scale"] = pc_wide["scale"][..., 0].unsqueeze(-1)
83
+ for k, shape in shapes_orig.items():
84
+ pc_wide[k] = pc_wide[k].reshape(*shape)
85
+ return pc_wide["scale"], pc_wide["rotation"], pc_wide["translation"]
86
+
87
+
88
+ @dataclass
89
+ class PoseTarget:
90
+ x_instance_scale: torch.Tensor
91
+ x_instance_rotation: torch.Tensor
92
+ x_instance_translation: torch.Tensor
93
+ x_scene_scale: torch.Tensor
94
+ x_scene_center: torch.Tensor
95
+ x_translation_scale: torch.Tensor
96
+ pose_target_convention: str = field(default="unknown")
97
+
98
+
99
+ @dataclass
100
+ class InvariantPoseTarget:
101
+ """
102
+ This is the canonical representation of pose targets, used for computing metrics.
103
+ instance_pose <-> invariant_pose_targets <-> all other pose_target_conventions
104
+
105
+ Background:
106
+ ---
107
+ We want to estimate a transformation T: R³ → R³ despite scene scale ambiguity.
108
+
109
+ The transformation taking object points to scene points is defined as
110
+ T(x) = s · R(q) · x + t
111
+ where:
112
+ - x is a point in the object coordinate frame,
113
+ - q is a unit quaternion representing rotation,
114
+ - s is the object-to-scene scale, and
115
+ - t is the translation.
116
+
117
+ However, there is an inherent scale ambiguity in the scene, denoted as s_scene;
118
+ This ambiguity introduces irreducible error that complicates both evaluation and training.
119
+
120
+ To decouple the scene scale from the invariant quantities, we define:
121
+ T(x) = s_scene · |t_rel| [ s_tilde · R(q) · x + t_unit ]
122
+ where we define
123
+ t_rel = t / s_scene
124
+ s_rel = s / s_scene
125
+ s_tilde = s_rel / |t_rel|
126
+ t_unit = t_rel / |t_rel|
127
+
128
+ During training, you would predict (q, s_tilde, t_unit), leaving s_scene separate.
129
+
130
+
131
+ Hand-wavy error analysis:
132
+ ---
133
+ 1. Naive (coupled) estimate:
134
+ T(x) = s_scene [ s_rel · R(q) · x + t_rel ]
135
+
136
+ We can define:
137
+ U = ln(s_rel)
138
+ V = ln(|t_rel|)
139
+ so that the error is governed by Var(U + V).
140
+
141
+ 2. In the decoupled case, we have:
142
+ T(x) = s_scene · |t_rel| [ s_tilde · R(q) · x + t_unit ]
143
+ = s_scene · |t_rel| [ (s_rel / |t_rel|) R(q) · x + t_unit ]
144
+ Then ln(s_tilde) = ln(s_rel) - ln(|t_rel|) = U - V, and the error is
145
+ Var(U - V) = Var(U) + Var(V) - 2Cov(U, V).
146
+
147
+ """
148
+
149
+ # These are invariant
150
+ q: torch.Tensor
151
+ t_unit: torch.Tensor
152
+ s_scene: torch.Tensor
153
+ t_scene_center: Optional[torch.Tensor] = None
154
+ t_rel_norm: Optional[torch.Tensor] = None
155
+ s_tilde: Optional[torch.Tensor] = None
156
+ s_rel: Optional[torch.Tensor] = None
157
+
158
+ def __post_init__(self):
159
+ # Check that fields that are required always have values.
160
+ if self.q is None:
161
+ raise ValueError("Field 'q' (quaternion) must be provided.")
162
+ if self.s_scene is None:
163
+ raise ValueError("Field 's_scene' must be provided.")
164
+ if self.s_rel is None:
165
+ if self.s_tilde is not None:
166
+ self.s_rel = self.s_tilde * self.t_rel_norm
167
+ else:
168
+ raise ValueError("Field 's_rel' or 's_tilde' must be provided.")
169
+ if self.t_unit is None:
170
+ raise ValueError("Field 't_unit' must be provided.")
171
+
172
+ if self.t_scene_center is None:
173
+ self.t_scene_center = torch.zeros_like(self.t_unit)
174
+
175
+ # There is a simple relationship between s_tilde and t_rel_norm:
176
+ # s_tilde = s_rel / t_rel_norm
177
+ #
178
+ # If one of these is missing and the other is provided, we can compute the missing field.
179
+ if self.s_tilde is None and self.t_rel_norm is not None:
180
+ self.s_tilde = self.s_rel / self.t_rel_norm
181
+ elif self.t_rel_norm is None and self.s_tilde is not None:
182
+ self.t_rel_norm = self.s_rel / self.s_tilde
183
+
184
+ # If both are provided, we check for consistency.
185
+ if self.s_tilde is not None and self.t_rel_norm is not None:
186
+ computed_s_tilde = self.s_rel / self.t_rel_norm
187
+ # If the provided s_tilde deviates from what is computed, update it.
188
+ if not torch.allclose(self.s_tilde, computed_s_tilde, atol=1e-6):
189
+ logger.warning(
190
+ f"s_tilde and t_rel_norm are provided, but they are not consistent. "
191
+ f"Updating s_tilde to {computed_s_tilde}."
192
+ )
193
+ self.s_tilde = computed_s_tilde
194
+
195
+ self._validate_fields()
196
+
197
+ def _validate_fields(self):
198
+ for field in self.__dict__:
199
+ if self.__dict__[field] is None:
200
+ raise ValueError(f"Field '{field}' must be provided.")
201
+
202
+
203
+ @staticmethod
204
+ def from_instance_pose(instance_pose: InstancePose) -> "InvariantPoseTarget":
205
+ q = instance_pose.instance_quaternion_l2c
206
+ s_obj_to_scene = instance_pose.instance_scale_l2c # (..., 1) or (..., 3)
207
+ t_obj_to_scene = instance_pose.instance_position_l2c # (..., 3)
208
+ s_scene = instance_pose.scene_scale # (..., 1) or scalar-broadcastable
209
+ t_scene_center = instance_pose.scene_shift # (..., 3)
210
+
211
+ # Normalize to scene scale (per the derivation)
212
+ if not ( s_obj_to_scene.ndim == (s_scene.ndim + 1)):
213
+ raise ValueError(f"s_scene should be ND [...,3] and s_obj_to_scene should be (N+1)D [...,K,3], but got {s_scene.shape=} {s_obj_to_scene.shape=}")
214
+ if not (t_obj_to_scene.ndim == (s_scene.ndim + 1)):
215
+ raise ValueError(f"t_scene_center should be ND [B,3] and t_obj_to_scene should be (N+1)D [B,K,3], but got {t_scene_center.shape=} {t_obj_to_scene.shape=}")
216
+ s_scene_exp = s_scene.unsqueeze(-2)
217
+
218
+ s_rel = s_obj_to_scene / s_scene_exp
219
+ t_rel = t_obj_to_scene / s_scene_exp
220
+
221
+ # Robust norms
222
+ eps = 1e-8
223
+ t_rel_norm = t_rel.norm(dim=-1, keepdim=True).clamp_min(eps)
224
+
225
+ s_tilde = s_rel / t_rel_norm
226
+ t_unit = t_rel / t_rel_norm
227
+
228
+ return InvariantPoseTarget(
229
+ q=q,
230
+ s_scene=s_scene,
231
+ t_scene_center=t_scene_center,
232
+ s_rel=s_rel,
233
+ s_tilde=s_tilde,
234
+ t_unit=t_unit,
235
+ t_rel_norm=t_rel_norm,
236
+ )
237
+
238
+
239
+ @staticmethod
240
+ def to_instance_pose(invariant_targets: "InvariantPoseTarget") -> InstancePose:
241
+ # scale factor per the derivation: s_scene * |t_rel|
242
+ # Normalize to scene scale (per the derivation)
243
+ t_rel_norm_ndim = invariant_targets.t_rel_norm.ndim
244
+ if not (invariant_targets.s_scene.ndim == (t_rel_norm_ndim - 1)) :
245
+ raise ValueError(f"s_scene should be ND [...,3] and t_rel_norm should be (N+1)D [...,K,3], but got {invariant_targets.s_scene.shape=} {invariant_targets.t_rel_norm.shape=}")
246
+
247
+ scale = invariant_targets.s_scene.unsqueeze(-2) * invariant_targets.t_rel_norm
248
+ return InstancePose(
249
+ instance_scale_l2c=invariant_targets.s_tilde * scale,
250
+ instance_position_l2c=invariant_targets.t_unit * scale,
251
+ instance_quaternion_l2c=invariant_targets.q,
252
+ scene_scale=invariant_targets.s_scene,
253
+ scene_shift=invariant_targets.t_scene_center,
254
+ )
255
+
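A standalone numeric check of the decoupling described in the docstring above (pure torch, identity rotation for brevity): s · R(q) · x + t equals s_scene · |t_rel| · (s_tilde · R(q) · x + t_unit), with t_rel = t / s_scene, s_rel = s / s_scene, s_tilde = s_rel / |t_rel| and t_unit = t_rel / |t_rel|.

```python
import torch

s, s_scene = torch.tensor(2.0), torch.tensor(0.5)
t = torch.tensor([1.0, -2.0, 4.0])
x = torch.rand(3)

t_rel, s_rel = t / s_scene, s / s_scene
t_norm = t_rel.norm()
s_tilde, t_unit = s_rel / t_norm, t_rel / t_norm

lhs = s * x + t                                  # R(q) = identity
rhs = s_scene * t_norm * (s_tilde * x + t_unit)
assert torch.allclose(lhs, rhs, atol=1e-6)
```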
256
+
257
+ class PoseTargetConvention:
258
+ """
259
+ Converts pose_targets <-> instance_pose <-> invariant_pose_targets
260
+ """
261
+
262
+ pose_target_convention: str
263
+
264
+ @classmethod
265
+ def from_invariant(cls, invariant_targets: InvariantPoseTarget) -> PoseTarget:
266
+ raise NotImplementedError("Implement this in a subclass")
267
+
268
+ @classmethod
269
+ def to_invariant(cls, instance_pose: InstancePose) -> InvariantPoseTarget:
270
+ raise NotImplementedError("Implement this in a subclass")
271
+
272
+ @classmethod
273
+ def from_instance_pose(cls, instance_pose: InstancePose) -> PoseTarget:
274
+ invariant_targets = InvariantPoseTarget.from_instance_pose(instance_pose)
275
+ return cls.from_invariant(invariant_targets)
276
+
277
+ @classmethod
278
+ def to_instance_pose(cls, pose_target: PoseTarget) -> InstancePose:
279
+ invariant_targets = cls.to_invariant(pose_target)
280
+ return InvariantPoseTarget.to_instance_pose(invariant_targets)
281
+
282
+
283
+ class ScaleShiftInvariant(PoseTargetConvention):
284
+ """
285
+
286
+ Midas eq. (6): https://arxiv.org/pdf/1907.01341v3
287
+ But for pointmaps (see MoGe): https://arxiv.org/pdf/2410.19115
288
+ """
289
+
290
+ pose_target_convention: str = "ScaleShiftInvariant"
291
+ scale_mean = torch.tensor([1.0232692956924438, 1.0232691764831543, 1.0232692956924438]).to(torch.float32)
292
+ scale_std = torch.tensor([1.3773751258850098, 1.3773752450942993, 1.3773750066757202]).to(torch.float32)
293
+ translation_mean = torch.tensor([0.003191213821992278, 0.017236359417438507, 0.9401122331619263]).to(torch.float32)
294
+ translation_std = torch.tensor([1.341888666152954, 0.7665449380874634, 3.175130605697632]).to(torch.float32)
295
+
296
+ @classmethod
297
+ def from_instance_pose(cls, instance_pose: InstancePose, normalize: bool = False) -> PoseTarget:
298
+ metric_to_ssi = cls.ssi_to_metric(
299
+ instance_pose.scene_scale, instance_pose.scene_shift
300
+ ).inverse()
301
+
302
+ ssi_scale, ssi_rotation, ssi_translation = InstancePose._broadcast_postcompose(
303
+ scale=instance_pose.instance_scale_l2c,
304
+ rotation=instance_pose.instance_quaternion_l2c,
305
+ translation=instance_pose.instance_position_l2c,
306
+ transform_to_postcompose=metric_to_ssi,
307
+ )
308
+ # logger.info(f"{normalize=} {ssi_scale.shape=} {ssi_rotation.shape=} {ssi_translation.shape=}")
309
+ if normalize:
310
+ device = ssi_scale.device
311
+ ssi_scale = (ssi_scale - cls.scale_mean.to(device)) / cls.scale_std.to(device)
312
+ ssi_translation = (ssi_translation - cls.translation_mean.to(device)) / cls.translation_std.to(device)
313
+
314
+ return PoseTarget(
315
+ x_instance_scale=ssi_scale,
316
+ x_instance_rotation=ssi_rotation,
317
+ x_instance_translation=ssi_translation,
318
+ x_scene_scale=instance_pose.scene_scale,
319
+ x_scene_center=instance_pose.scene_shift,
320
+ x_translation_scale=torch.ones_like(ssi_scale)[..., 0].unsqueeze(-1),
321
+ pose_target_convention=cls.pose_target_convention,
322
+ )
323
+
324
+ @classmethod
325
+ def to_instance_pose(cls, pose_target: PoseTarget, normalize: bool = False) -> InstancePose:
326
+ scene_scale = pose_target.x_scene_scale
327
+ scene_shift = pose_target.x_scene_center
328
+ ssi_to_metric = cls.ssi_to_metric(scene_scale, scene_shift)
329
+
330
+ if normalize:
331
+ device = pose_target.x_instance_scale.device
332
+ pose_target.x_instance_scale = pose_target.x_instance_scale * cls.scale_std.to(device) + cls.scale_mean.to(device)
333
+ pose_target.x_instance_translation = pose_target.x_instance_translation * cls.translation_std.to(device) + cls.translation_mean.to(device)
334
+
335
+ ins_scale, ins_rotation, ins_translation = InstancePose._broadcast_postcompose(
336
+ scale=pose_target.x_instance_scale,
337
+ rotation=pose_target.x_instance_rotation,
338
+ translation=pose_target.x_instance_translation,
339
+ transform_to_postcompose=ssi_to_metric,
340
+ )
341
+
342
+ return InstancePose(
343
+ instance_scale_l2c=ins_scale,
344
+ instance_position_l2c=ins_translation,
345
+ instance_quaternion_l2c=ins_rotation,
346
+ scene_scale=scene_scale,
347
+ scene_shift=scene_shift,
348
+ )
349
+
350
+ @classmethod
351
+ def to_invariant(cls, pose_target: PoseTarget, normalize: bool = False) -> InvariantPoseTarget:
352
+ instance_pose = cls.to_instance_pose(pose_target, normalize=normalize)
353
+ return InvariantPoseTarget.from_instance_pose(instance_pose)
354
+
355
+ @classmethod
356
+ def from_invariant(cls, invariant_targets: InvariantPoseTarget, normalize: bool = False) -> PoseTarget:
357
+ instance_pose = InvariantPoseTarget.to_instance_pose(invariant_targets)
358
+ return cls.from_instance_pose(instance_pose, normalize=normalize)
359
+
360
+ @classmethod
361
+ def get_scale_and_shift(cls, pointmap):
362
+ shift_z = pointmap[..., -1].nanmedian().unsqueeze(0)
363
+ shift = torch.zeros_like(shift_z.expand(1, 3))
364
+ shift[..., -1] = shift_z
365
+
366
+ shifted_pointmap = pointmap - shift
367
+ scale = shifted_pointmap.abs().nanmean().to(shift.device)
368
+
369
+ shift = shift.reshape(3)
370
+ scale = scale.expand(3)
371
+
372
+ return scale, shift
373
+
374
+ @staticmethod
375
+ def ssi_to_metric(scale: torch.Tensor, shift: torch.Tensor):
376
+ if scale.ndim == 1:
377
+ scale = scale.unsqueeze(0)
378
+ if shift.ndim == 1:
379
+ shift = shift.unsqueeze(0)
380
+ return Transform3d().scale(scale).translate(shift).to(shift.device)
381
+
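A sketch of the two helpers above (assuming pytorch3d and the `sam3d_objects` package are installed): `get_scale_and_shift` centres the shift on the median z and uses the mean absolute value of the z-centred pointmap as an isotropic scale, and `ssi_to_metric` builds the transform p_metric = p_ssi * scale + shift.

```python
import torch
from sam3d_objects.data.dataset.tdfy.pose_target import ScaleShiftInvariant

pointmap = torch.rand(64, 64, 3) + 1.0            # (H, W, 3)
scale, shift = ScaleShiftInvariant.get_scale_and_shift(pointmap)
assert scale.shape == (3,) and shift.shape == (3,)
assert torch.isclose(shift[-1], pointmap[..., -1].median())

ssi_to_metric = ScaleShiftInvariant.ssi_to_metric(scale, shift)
origin_metric = ssi_to_metric.transform_points(torch.zeros(1, 3))
assert torch.allclose(origin_metric.reshape(3), shift, atol=1e-6)  # SSI origin maps to the shift
```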
382
+
383
+ class ScaleShiftInvariantWTranslationScale(PoseTargetConvention):
384
+ """
385
+
386
+ Midas eq. (6): https://arxiv.org/pdf/1907.01341v3
387
+ But for pointmaps (see MoGe): https://arxiv.org/pdf/2410.19115
388
+ """
389
+
390
+ pose_target_convention: str = "ScaleShiftInvariantWTranslationScale"
391
+ scale_mean = torch.tensor([1.0232692956924438, 1.0232691764831543, 1.0232692956924438]).to(torch.float32)
392
+ scale_std = torch.tensor([1.3773751258850098, 1.3773752450942993, 1.3773750066757202]).to(torch.float32)
393
+ translation_mean = torch.tensor([0.003191213821992278, 0.017236359417438507, 0.9401122331619263]).to(torch.float32)
394
+ translation_std = torch.tensor([1.341888666152954, 0.7665449380874634, 3.175130605697632]).to(torch.float32)
395
+
396
+ @classmethod
397
+ def from_instance_pose(cls, instance_pose: InstancePose, normalize: bool = False) -> PoseTarget:
398
+ metric_to_ssi = cls.ssi_to_metric(
399
+ instance_pose.scene_scale, instance_pose.scene_shift
400
+ ).inverse()
401
+
402
+ ssi_scale, ssi_rotation, ssi_translation = InstancePose._broadcast_postcompose(
403
+ scale=instance_pose.instance_scale_l2c,
404
+ rotation=instance_pose.instance_quaternion_l2c,
405
+ translation=instance_pose.instance_position_l2c,
406
+ transform_to_postcompose=metric_to_ssi,
407
+ )
408
+
409
+ ssi_translation_scale = ssi_translation.norm(dim=-1, keepdim=True)
410
+ ssi_translation_unit = ssi_translation / ssi_translation_scale.clamp_min(1e-7)
411
+
412
+ return PoseTarget(
413
+ x_instance_scale=ssi_scale,
414
+ x_instance_rotation=ssi_rotation,
415
+ x_instance_translation=ssi_translation_unit,
416
+ x_scene_scale=instance_pose.scene_scale,
417
+ x_scene_center=instance_pose.scene_shift,
418
+ x_translation_scale=ssi_translation_scale,
419
+ pose_target_convention=cls.pose_target_convention,
420
+ )
421
+
422
+ @classmethod
423
+ def to_instance_pose(cls, pose_target: PoseTarget, normalize: bool = False) -> InstancePose:
424
+ scene_scale = pose_target.x_scene_scale
425
+ scene_shift = pose_target.x_scene_center
426
+ ssi_to_metric = cls.ssi_to_metric(scene_scale, scene_shift)
427
+
428
+ ins_translation_unit = pose_target.x_instance_translation / pose_target.x_instance_translation.norm(dim=-1, keepdim=True)
429
+ ins_translation = ins_translation_unit * pose_target.x_translation_scale
430
+
431
+
432
+ ins_scale, ins_rotation, ins_translation = InstancePose._broadcast_postcompose(
433
+ scale=pose_target.x_instance_scale,
434
+ rotation=pose_target.x_instance_rotation,
435
+ translation=ins_translation,
436
+ transform_to_postcompose=ssi_to_metric,
437
+ )
438
+
439
+
440
+ return InstancePose(
441
+ instance_scale_l2c=ins_scale,
442
+ instance_position_l2c=ins_translation,
443
+ instance_quaternion_l2c=ins_rotation,
444
+ scene_scale=scene_scale,
445
+ scene_shift=scene_shift,
446
+ )
447
+
448
+ @classmethod
449
+ def to_invariant(cls, pose_target: PoseTarget) -> InvariantPoseTarget:
450
+ instance_pose = cls.to_instance_pose(pose_target)
451
+ return InvariantPoseTarget.from_instance_pose(instance_pose)
452
+
453
+ @classmethod
454
+ def from_invariant(cls, invariant_targets: InvariantPoseTarget) -> PoseTarget:
455
+ instance_pose = InvariantPoseTarget.to_instance_pose(invariant_targets)
456
+ return cls.from_instance_pose(instance_pose)
457
+
458
+ @classmethod
459
+ def get_scale_and_shift(cls, pointmap):
460
+ shift_z = pointmap[..., -1].nanmedian().unsqueeze(0)
461
+ shift = torch.zeros_like(shift_z.expand(1, 3))
462
+ shift[..., -1] = shift_z
463
+
464
+ shifted_pointmap = pointmap - shift
465
+ scale = shifted_pointmap.abs().nanmean().to(shift.device)
466
+
467
+ shift = shift.reshape(3)
468
+ scale = scale.expand(3)
469
+
470
+ return scale, shift
471
+
472
+ @staticmethod
473
+ def ssi_to_metric(scale: torch.Tensor, shift: torch.Tensor):
474
+ if scale.ndim == 1:
475
+ scale = scale.unsqueeze(0)
476
+ if shift.ndim == 1:
477
+ shift = shift.unsqueeze(0)
478
+ return Transform3d().scale(scale).translate(shift).to(shift.device)
479
+
480
+
481
+ class DisparitySpace(PoseTargetConvention):
482
+ pose_target_convention: str = "DisparitySpace"
483
+
484
+ @classmethod
485
+ def from_instance_pose(cls, instance_pose: InstancePose, normalize: bool = False) -> PoseTarget:
486
+
487
+ # x_instance_scale = orig_scale / scene_scale
488
+ # x_instance_translation = [x/z, y/z, 0] / scene_scale
489
+ # x_translation_scale = z / scene_scale
490
+ assert torch.allclose(instance_pose.scene_scale, torch.ones_like(instance_pose.scene_scale))
491
+
492
+ if not instance_pose.scene_shift.ndim == instance_pose.instance_position_l2c.ndim - 1:
493
+ raise ValueError(f"scene_shift must be (N+1)D and instance_position_l2c must be (N+1)D, but got {instance_pose.scene_shift.ndim} and {instance_pose.instance_position_l2c.ndim}")
494
+ shift_xy, shift_z_log = instance_pose.scene_shift.unsqueeze(-2).split([2, 1], dim=-1)
495
+
496
+
497
+ pose_xy, pose_z = instance_pose.instance_position_l2c.split([2, 1], dim=-1)
498
+ # Handle batch dimensions properly
499
+ if shift_xy.ndim < pose_xy.ndim:
500
+ shift_xy = shift_xy.unsqueeze(-2)
501
+ pose_xy_scaled = pose_xy / pose_z - shift_xy
502
+
503
+ pose_z_scaled_log = torch.log(pose_z) - shift_z_log
504
+ x_instance_scale_log = torch.log(instance_pose.instance_scale_l2c) - torch.log(pose_z)
505
+
506
+ x_instance_translation = torch.cat([pose_xy_scaled, torch.zeros_like(pose_z)], dim=-1)
507
+ x_translation_scale = torch.exp(pose_z_scaled_log)
508
+ x_instance_scale = torch.exp(x_instance_scale_log)
509
+
510
+
511
+
512
+ return PoseTarget(
513
+ x_instance_scale=x_instance_scale,
514
+ x_instance_translation=x_instance_translation,
515
+ x_instance_rotation=instance_pose.instance_quaternion_l2c,
516
+ x_scene_scale=instance_pose.scene_scale,
517
+ x_scene_center=instance_pose.scene_shift,
518
+ x_translation_scale=x_translation_scale,
519
+ pose_target_convention=cls.pose_target_convention,
520
+ )
521
+
522
+ @classmethod
523
+ def to_instance_pose(cls, pose_target: PoseTarget, normalize: bool = False) -> InstancePose:
524
+ scene_scale = pose_target.x_scene_scale
525
+ scene_shift = pose_target.x_scene_center
526
+
527
+ if not pose_target.x_scene_center.ndim == pose_target.x_instance_translation.ndim - 1:
528
+ raise ValueError(f"x_scene_center must be (N+1)D and x_instance_translation must be (N+1)D, but got {pose_target.x_scene_center.ndim} and {pose_target.x_instance_translation.ndim}")
529
+ shift_xy, shift_z_log = pose_target.x_scene_center.unsqueeze(-2).split([2, 1], dim=-1)
530
+ scene_z_scale = torch.exp(shift_z_log)
531
+
532
+ z = pose_target.x_translation_scale
533
+ ins_translation = pose_target.x_instance_translation.clone()
534
+ ins_translation[...,2] = 1.0
535
+ ins_translation[...,:2] = ins_translation[...,:2] + shift_xy
536
+ ins_translation = ins_translation * z * scene_z_scale
537
+
538
+ ins_scale = pose_target.x_instance_scale * z * scene_z_scale
539
+
540
+ return InstancePose(
541
+ instance_scale_l2c=ins_scale * scene_scale,
542
+ instance_position_l2c=ins_translation * scene_scale,
543
+ instance_quaternion_l2c=pose_target.x_instance_rotation,
544
+ scene_scale=scene_scale,
545
+ scene_shift=scene_shift,
546
+ )
547
+
548
+ @classmethod
549
+ def to_invariant(cls, pose_target: PoseTarget, normalize: bool = False) -> InvariantPoseTarget:
550
+ instance_pose = cls.to_instance_pose(pose_target, normalize=normalize)
551
+ return InvariantPoseTarget.from_instance_pose(instance_pose)
552
+
553
+ @classmethod
554
+ def from_invariant(cls, invariant_targets: InvariantPoseTarget, normalize: bool = False) -> PoseTarget:
555
+ instance_pose = InvariantPoseTarget.to_instance_pose(invariant_targets)
556
+ return cls.from_instance_pose(instance_pose, normalize=normalize)
557
+
558
+
559
+
560
+ class NormalizedSceneScale(PoseTargetConvention):
561
+ """
562
+ x_instance_scale and x_translation_scale are normalized to x_scene_scale
563
+ """
564
+
565
+ pose_target_convention: str = "NormalizedSceneScale"
566
+
567
+ @classmethod
568
+ def from_invariant(cls, invariant_targets: InvariantPoseTarget):
569
+ translation = invariant_targets.t_unit * invariant_targets.t_rel_norm
570
+ return PoseTarget(
571
+ x_instance_scale=invariant_targets.s_rel,
572
+ x_instance_rotation=invariant_targets.q,
573
+ x_instance_translation=translation,
574
+ x_scene_scale=invariant_targets.s_scene,
575
+ x_scene_center=invariant_targets.t_scene_center,
576
+ x_translation_scale=torch.ones_like(invariant_targets.t_rel_norm),
577
+ pose_target_convention=cls.pose_target_convention,
578
+ )
579
+
580
+ @classmethod
581
+ def to_invariant(cls, pose_target: PoseTarget):
582
+ t_rel_norm = torch.norm(
583
+ pose_target.x_instance_translation, dim=-1, keepdim=True
584
+ )
585
+ return InvariantPoseTarget(
586
+ s_scene=pose_target.x_scene_scale,
587
+ s_rel=pose_target.x_instance_scale,
588
+ q=pose_target.x_instance_rotation,
589
+ t_unit=pose_target.x_instance_translation / t_rel_norm,
590
+ t_rel_norm=t_rel_norm,
591
+ t_scene_center=pose_target.x_scene_center,
592
+ )
593
+
594
+
595
+ class Naive(PoseTargetConvention):
596
+ pose_target_convention: str = "Naive"
597
+
598
+ @classmethod
599
+ def from_invariant(cls, invariant_targets: InvariantPoseTarget):
600
+ s_scene = invariant_targets.s_rel * invariant_targets.s_scene
601
+ t_scene = invariant_targets.t_unit * invariant_targets.t_rel_norm
602
+ return PoseTarget(
603
+ x_instance_scale=s_scene,
604
+ x_instance_rotation=invariant_targets.q,
605
+ x_instance_translation=t_scene,
606
+ x_scene_scale=invariant_targets.s_scene,
607
+ x_scene_center=invariant_targets.t_scene_center,
608
+ x_translation_scale=torch.ones_like(invariant_targets.t_rel_norm),
609
+ pose_target_convention=cls.pose_target_convention,
610
+ )
611
+
612
+ @classmethod
613
+ def to_invariant(cls, pose_target: PoseTarget):
614
+ s_scene = pose_target.x_scene_scale
615
+ t_rel_norm = torch.norm(
616
+ pose_target.x_instance_translation, dim=-1, keepdim=True
617
+ )
618
+ return InvariantPoseTarget(
619
+ s_scene=s_scene,
620
+ t_scene_center=pose_target.x_scene_center,
621
+ s_rel=pose_target.x_instance_scale / s_scene,
622
+ q=pose_target.x_instance_rotation,
623
+ t_unit=pose_target.x_instance_translation / t_rel_norm,
624
+ t_rel_norm=t_rel_norm,
625
+ )
626
+
627
+
628
+ class NormalizedSceneScaleAndTranslation(PoseTargetConvention):
629
+ """
630
+ x_instance_scale and x_translation_scale are normalized to x_scene_scale
631
+ x_instance_translation is unit
632
+ """
633
+
634
+ pose_target_convention: str = "NormalizedSceneScaleAndTranslation"
635
+
636
+ @classmethod
637
+ def from_invariant(cls, invariant_targets: InvariantPoseTarget):
638
+ return PoseTarget(
639
+ x_instance_scale=invariant_targets.s_rel,
640
+ x_instance_rotation=invariant_targets.q,
641
+ x_instance_translation=invariant_targets.t_unit,
642
+ x_scene_scale=invariant_targets.s_scene,
643
+ x_scene_center=invariant_targets.t_scene_center,
644
+ x_translation_scale=invariant_targets.t_rel_norm,
645
+ pose_target_convention=cls.pose_target_convention,
646
+ )
647
+
648
+ @classmethod
649
+ def to_invariant(cls, pose_target: PoseTarget):
650
+ return InvariantPoseTarget(
651
+ s_scene=pose_target.x_scene_scale,
652
+ t_scene_center=pose_target.x_scene_center,
653
+ s_rel=pose_target.x_instance_scale,
654
+ q=pose_target.x_instance_rotation,
655
+ t_unit=pose_target.x_instance_translation,
656
+ t_rel_norm=pose_target.x_translation_scale,
657
+ )
658
+
659
+
660
+ class ApparentSize(PoseTargetConvention):
661
+ pose_target_convention: str = "ApparentSize"
662
+
663
+ @classmethod
664
+ def from_invariant(cls, invariant_targets: InvariantPoseTarget):
665
+ return PoseTarget(
666
+ x_instance_scale=invariant_targets.s_tilde,
667
+ x_instance_rotation=invariant_targets.q,
668
+ x_instance_translation=invariant_targets.t_unit,
669
+ x_scene_scale=invariant_targets.s_scene,
670
+ x_scene_center=invariant_targets.t_scene_center,
671
+ x_translation_scale=invariant_targets.t_rel_norm,
672
+ pose_target_convention=cls.pose_target_convention,
673
+ )
674
+
675
+ @classmethod
676
+ def to_invariant(cls, pose_target: PoseTarget):
677
+ return InvariantPoseTarget(
678
+ s_scene=pose_target.x_scene_scale,
679
+ t_scene_center=pose_target.x_scene_center,
680
+ s_tilde=pose_target.x_instance_scale,
681
+ q=pose_target.x_instance_rotation,
682
+ t_unit=pose_target.x_instance_translation,
683
+ t_rel_norm=pose_target.x_translation_scale,
684
+ )
685
+
686
+
687
+ class Identity(PoseTargetConvention):
688
+ """
689
+ Identity convention - no transformation applied.
690
+ Direct passthrough mapping between instance pose and pose target values.
691
+ This preserves all values including scene_scale and scene_shift.
692
+ """
693
+
694
+ pose_target_convention: str = "Identity"
695
+
696
+ @classmethod
697
+ def from_instance_pose(cls, instance_pose: InstancePose) -> PoseTarget:
698
+ return PoseTarget(
699
+ x_instance_scale=instance_pose.instance_scale_l2c,
700
+ x_instance_rotation=instance_pose.instance_quaternion_l2c,
701
+ x_instance_translation=instance_pose.instance_position_l2c,
702
+ x_scene_scale=instance_pose.scene_scale,
703
+ x_scene_center=instance_pose.scene_shift,
704
+ x_translation_scale=torch.ones_like(instance_pose.instance_scale_l2c)[..., 0].unsqueeze(-1),
705
+ pose_target_convention=cls.pose_target_convention,
706
+ )
707
+
708
+ @classmethod
709
+ def to_instance_pose(cls, pose_target: PoseTarget) -> InstancePose:
710
+ return InstancePose(
711
+ instance_scale_l2c=pose_target.x_instance_scale,
712
+ instance_position_l2c=pose_target.x_instance_translation,
713
+ instance_quaternion_l2c=pose_target.x_instance_rotation,
714
+ scene_scale=pose_target.x_scene_scale,
715
+ scene_shift=pose_target.x_scene_center,
716
+ )
717
+
718
+ @classmethod
719
+ def to_invariant(cls, pose_target: PoseTarget) -> InvariantPoseTarget:
720
+ instance_pose = cls.to_instance_pose(pose_target)
721
+ return InvariantPoseTarget.from_instance_pose(instance_pose)
722
+
723
+ @classmethod
724
+ def from_invariant(cls, invariant_targets: InvariantPoseTarget) -> PoseTarget:
725
+ instance_pose = InvariantPoseTarget.to_instance_pose(invariant_targets)
726
+ return cls.from_instance_pose(instance_pose)
727
+
728
+
729
+ class PoseTargetConverter:
730
+ @staticmethod
731
+ def pose_target_to_instance_pose(pose_target: PoseTarget, normalize: bool = False) -> InstancePose:
732
+ _convention_class = globals()[pose_target.pose_target_convention]
733
+ if _convention_class == ScaleShiftInvariant:
734
+ return _convention_class.to_instance_pose(pose_target, normalize=normalize)
735
+ else:
736
+ return _convention_class.to_instance_pose(pose_target)
737
+
738
+ @staticmethod
739
+ def instance_pose_to_pose_target(
740
+ instance_pose: InstancePose, pose_target_convention: str, normalize: bool = False
741
+ ) -> PoseTarget:
742
+ _convention_class = globals()[pose_target_convention]
743
+ if _convention_class == ScaleShiftInvariant:
744
+ return _convention_class.from_instance_pose(instance_pose, normalize=normalize)
745
+ else:
746
+ return _convention_class.from_instance_pose(instance_pose)
747
+
748
+ @staticmethod
749
+ def dicts_instance_pose_to_pose_target(
750
+ pose_target_convention: str,
751
+ **kwargs,
752
+ ):
753
+ instance_pose = InstancePose(**kwargs)
754
+ pose_target = PoseTargetConverter.instance_pose_to_pose_target(
755
+ instance_pose, pose_target_convention
756
+ )
757
+ return asdict(pose_target)
758
+
759
+ @staticmethod
760
+ def dicts_pose_target_to_instance_pose(
761
+ **kwargs,
762
+ ):
763
+ pose_target_convention = kwargs.get("pose_target_convention")
764
+ _convention_class = globals()[pose_target_convention]
765
+ assert (
766
+ _convention_class.pose_target_convention == pose_target_convention
767
+ ), f"Normalization name mismatch: {_convention_class.pose_target_convention} != {pose_target_convention}"
768
+
769
+ normalize = kwargs.pop("normalize", False)
770
+ pose_target = PoseTarget(**kwargs)
771
+ instance_pose = PoseTargetConverter.pose_target_to_instance_pose(pose_target, normalize)
772
+ return asdict(instance_pose)
773
+
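As a quick illustration of the converter, the sketch below round-trips a pose through the Identity convention. It is a minimal sketch only: the InstancePose field names are taken from the Identity methods above, while the batch-of-one tensor shapes are assumptions.

    import torch

    instance_pose = InstancePose(
        instance_scale_l2c=torch.ones(1, 3),
        instance_position_l2c=torch.zeros(1, 3),
        instance_quaternion_l2c=torch.tensor([[1.0, 0.0, 0.0, 0.0]]),  # identity rotation
        scene_scale=torch.ones(1, 1),
        scene_shift=torch.zeros(1, 3),
    )
    target = PoseTargetConverter.instance_pose_to_pose_target(instance_pose, "Identity")
    recovered = PoseTargetConverter.pose_target_to_instance_pose(target)
    # Identity is a pure passthrough, so `recovered` matches `instance_pose` field by field.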
774
+
775
+ class LogScaleShiftNormalizer:
776
+ def __init__(self, shift_log: torch.Tensor = 0.0, scale_log: torch.Tensor = 1.0):
777
+ self.shift_log = shift_log
778
+ self.scale_log = scale_log
779
+
780
+ def normalize(self, value: torch.Tensor):
781
+ return (torch.log(value) - self.shift_log) / self.scale_log
782
+
783
+ def denormalize(self, value: torch.Tensor):
784
+ return torch.exp(value * self.scale_log + self.shift_log)
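A small round-trip sketch for the log-space normalizer; the shift/scale constants are arbitrary and positive inputs are assumed so the log is defined.

    import torch

    norm = LogScaleShiftNormalizer(shift_log=0.5, scale_log=2.0)
    value = torch.tensor([0.1, 1.0, 10.0])
    roundtrip = norm.denormalize(norm.normalize(value))
    # denormalize() inverts normalize(), so roundtrip equals value up to float error.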
thirdparty/sam3d/sam3d/sam3d_objects/data/dataset/tdfy/preprocessor.py ADDED
@@ -0,0 +1,203 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import warnings
3
+ import torch
4
+ from loguru import logger
5
+ from dataclasses import dataclass
6
+ from typing import Callable, Optional
7
+ import warnings
8
+
9
+ from .img_and_mask_transforms import (
10
+ SSIPointmapNormalizer,
11
+ )
12
+
13
+
14
+ # Load and process data
15
+ @dataclass
16
+ class PreProcessor:
17
+ """
18
+ Preprocessor configuration for image, mask, and pointmap transforms.
19
+
20
+ Transform application order:
21
+ 1. Pointmap normalization (if normalize_pointmap=True)
22
+ 2. Joint transforms (img_mask_pointmap_joint_transform or img_mask_joint_transform)
23
+ 3. Individual transforms (img_transform, mask_transform, pointmap_transform)
24
+
25
+ For backward compatibility, img_mask_joint_transform is preserved. When both
26
+ img_mask_pointmap_joint_transform and img_mask_joint_transform are present,
27
+ img_mask_pointmap_joint_transform takes priority.
28
+ """
29
+
30
+ img_transform: Callable = (None,)
31
+ mask_transform: Callable = (None,)
32
+ img_mask_joint_transform: list[Callable] = (None,)
33
+ rgb_img_mask_joint_transform: list[Callable] = (None,)
34
+
35
+ # New fields for pointmap support
36
+ pointmap_transform: Callable = (None,)
37
+ img_mask_pointmap_joint_transform: list[Callable] = (None,)
38
+
39
+ # Pointmap normalization option
40
+ normalize_pointmap: bool = False
41
+ pointmap_normalizer: Optional[Callable] = None
42
+ rgb_pointmap_normalizer: Optional[Callable] = None
43
+
44
+ def __post_init__(self):
45
+ if self.pointmap_normalizer is None:
46
+ self.pointmap_normalizer = SSIPointmapNormalizer()
47
+ if not self.normalize_pointmap:
48
+ warnings.warn("normalize_pointmap is False: the normalization moments will still be returned, but the pointmap itself is left unnormalized. This keeps legacy unnormalized-pointmap models working, but it is deprecated and error-prone.", DeprecationWarning, stacklevel=2)
49
+
50
+ if self.rgb_pointmap_normalizer is None:
51
+ logger.warning("No rgb pointmap normalizer provided, using scale + shift ")
52
+ self.rgb_pointmap_normalizer = self.pointmap_normalizer
53
+
54
+
55
+ def _normalize_pointmap(
56
+ self, pointmap: torch.Tensor,
57
+ mask: torch.Tensor,
58
+ pointmap_normalizer: Callable,
59
+ scale: Optional[torch.Tensor] = None,
60
+ shift: Optional[torch.Tensor] = None,
61
+ ):
62
+ if pointmap is None:
63
+ return pointmap, None, None
64
+
65
+ if not self.normalize_pointmap:
66
+ # old behavior: Pose is normalized to the pointmap center, but pointmap is not
67
+ _, pointmap_scale, pointmap_shift = pointmap_normalizer.normalize(pointmap, mask)
68
+ return pointmap, pointmap_scale, pointmap_shift
69
+
70
+ if scale is not None or shift is not None:
71
+ return pointmap_normalizer.normalize(pointmap, mask, scale, shift)
72
+
73
+ return pointmap_normalizer.normalize(pointmap, mask)
74
+
75
+ def _process_image_mask_pointmap_mess(
76
+ self, rgb_image, rgb_image_mask, pointmap=None
77
+ ):
78
+ """Extended version that handles pointmaps"""
79
+
80
+ # Apply pointmap normalization if enabled
81
+ pointmap_for_crop, pointmap_scale, pointmap_shift = self._normalize_pointmap(
82
+ pointmap, rgb_image_mask, self.pointmap_normalizer
83
+ )
84
+
85
+ # Apply transforms to the original full rgb image and mask.
86
+ rgb_image, rgb_image_mask = self._preprocess_rgb_image_mask(rgb_image, rgb_image_mask)
87
+
88
+ # These two are typically used for getting cropped images of the object
89
+ # : first apply joint transforms
90
+ processed_rgb_image, processed_mask, processed_pointmap = (
91
+ self._preprocess_image_mask_pointmap(rgb_image, rgb_image_mask, pointmap_for_crop)
92
+ )
93
+ # : then apply individual transforms on top of the joint transforms
94
+ processed_rgb_image = self._apply_transform(
95
+ processed_rgb_image, self.img_transform
96
+ )
97
+ processed_mask = self._apply_transform(processed_mask, self.mask_transform)
98
+ if processed_pointmap is not None:
99
+ processed_pointmap = self._apply_transform(
100
+ processed_pointmap, self.pointmap_transform
101
+ )
102
+
103
+ # This version is typically the full version of the image
104
+ # : apply individual transforms only
105
+ rgb_image = self._apply_transform(rgb_image, self.img_transform)
106
+ rgb_image_mask = self._apply_transform(rgb_image_mask, self.mask_transform)
107
+
108
+ rgb_pointmap, rgb_pointmap_scale, rgb_pointmap_shift = self._normalize_pointmap(
109
+ pointmap, rgb_image_mask, self.rgb_pointmap_normalizer, pointmap_scale, pointmap_shift
110
+ )
111
+
112
+ if rgb_pointmap is not None:
113
+ rgb_pointmap = self._apply_transform(rgb_pointmap, self.pointmap_transform)
114
+
115
+ result = {
116
+ "mask": processed_mask,
117
+ "image": processed_rgb_image,
118
+ "rgb_image": rgb_image,
119
+ "rgb_image_mask": rgb_image_mask,
120
+ }
121
+
122
+ # Add pointmap results if available
123
+ if processed_pointmap is not None:
124
+ result.update(
125
+ {
126
+ "pointmap": processed_pointmap,
127
+ "rgb_pointmap": rgb_pointmap,
128
+ }
129
+ )
130
+
131
+ # Add normalization parameters if normalization was applied
132
+ if pointmap_scale is not None and pointmap_shift is not None:
133
+ result.update(
134
+ {
135
+ "pointmap_scale": pointmap_scale,
136
+ "pointmap_shift": pointmap_shift,
137
+ "rgb_pointmap_scale": rgb_pointmap_scale,
138
+ "rgb_pointmap_shift": rgb_pointmap_shift,
139
+ }
140
+ )
141
+
142
+ return result
143
+
144
+ def _process_image_and_mask_mess(self, rgb_image, rgb_image_mask):
145
+ """Original method - calls extended version without pointmap"""
146
+ return self._process_image_mask_pointmap_mess(rgb_image, rgb_image_mask, None)
147
+
148
+ def _preprocess_rgb_image_mask(self, rgb_image: torch.Tensor, rgb_image_mask: torch.Tensor):
149
+ """Apply joint transforms to rgb_image and rgb_image_mask."""
150
+ if (
151
+ self.rgb_img_mask_joint_transform != (None,)
152
+ and self.rgb_img_mask_joint_transform is not None
153
+ ):
154
+ for trans in self.rgb_img_mask_joint_transform:
155
+ rgb_image, rgb_image_mask = trans(rgb_image, rgb_image_mask)
156
+ return rgb_image, rgb_image_mask
157
+
158
+ def _preprocess_image_mask_pointmap(self, rgb_image, mask_image, pointmap=None):
159
+ """Apply joint transforms with priority: triple transforms > dual transforms."""
160
+ # Priority: img_mask_pointmap_joint_transform when pointmap is provided
161
+ if (
162
+ self.img_mask_pointmap_joint_transform != (None,)
163
+ and self.img_mask_pointmap_joint_transform is not None
164
+ and pointmap is not None
165
+ ):
166
+ for trans in self.img_mask_pointmap_joint_transform:
167
+ rgb_image, mask_image, pointmap = trans(
168
+ rgb_image, mask_image, pointmap=pointmap
169
+ )
170
+ return rgb_image, mask_image, pointmap
171
+
172
+ # Fallback: img_mask_joint_transform (existing behavior)
173
+ elif (
174
+ self.img_mask_joint_transform != (None,)
175
+ and self.img_mask_joint_transform is not None
176
+ ):
177
+ for trans in self.img_mask_joint_transform:
178
+ rgb_image, mask_image = trans(rgb_image, mask_image)
179
+ return rgb_image, mask_image, pointmap
180
+
181
+ return rgb_image, mask_image, pointmap
182
+
183
+ def _preprocess_image_and_mask(self, rgb_image, mask_image):
184
+ """Backward compatibility wrapper - only applies dual transforms"""
185
+ rgb_image, mask_image, _ = self._preprocess_image_mask_pointmap(
186
+ rgb_image, mask_image, None
187
+ )
188
+ return rgb_image, mask_image
189
+
190
+ # keep here for backward compatibility
191
+ def _preprocess_image_and_mask_inference(self, rgb_image, mask_image):
192
+ warnings.warn(
193
+ "The _preprocess_image_and_mask_inference is deprecated! Please use _preprocess_image_and_mask",
194
+ category=DeprecationWarning,
195
+ stacklevel=2,
196
+ )
197
+ return self._preprocess_image_and_mask(rgb_image, mask_image)
198
+
199
+ def _apply_transform(self, input: torch.Tensor, transform):
200
+ if input is not None and transform is not None and transform != (None,):
201
+ input = transform(input)
202
+
203
+ return input
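A minimal construction sketch for the preprocessor above, assuming torchvision is available; the normalization constants and tensor shapes are illustrative choices, and no crop/joint transforms are configured.

    import torch
    from torchvision import transforms

    pre = PreProcessor(
        img_transform=transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        normalize_pointmap=True,  # __post_init__ installs the default SSIPointmapNormalizer
    )
    rgb = torch.rand(3, 256, 256)
    mask = (torch.rand(1, 256, 256) > 0.5).float()

    out = pre._process_image_and_mask_mess(rgb, mask)
    # out holds "image", "mask", "rgb_image" and "rgb_image_mask"; passing a pointmap to
    # _process_image_mask_pointmap_mess additionally yields "pointmap", "rgb_pointmap"
    # and the *_scale / *_shift normalization moments.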
thirdparty/sam3d/sam3d/sam3d_objects/data/dataset/tdfy/transforms_3d.py ADDED
@@ -0,0 +1,50 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ from collections import namedtuple
3
+ import math
4
+ import torch
5
+
6
+ from pytorch3d.transforms import (
7
+ Rotate,
8
+ Translate,
9
+ Scale,
10
+ Transform3d,
11
+ quaternion_to_matrix,
12
+ axis_angle_to_quaternion,
13
+ )
14
+
15
+ DecomposedTransform = namedtuple(
16
+ "DecomposedTransform", ["scale", "rotation", "translation"]
17
+ )
18
+
19
+
20
+ def compose_transform(
21
+ scale: torch.Tensor, rotation: torch.Tensor, translation: torch.Tensor
22
+ ) -> Transform3d:
23
+ """
24
+ Args:
25
+ scale: (..., 3) tensor of scale factors
26
+ rotation: (..., 3, 3) tensor of rotation matrices
27
+ translation: (..., 3) tensor of translation vectors
28
+ """
29
+ tfm = Transform3d(dtype=scale.dtype, device=scale.device)
30
+ return tfm.scale(scale).rotate(rotation).translate(translation)
31
+
32
+
33
+ def decompose_transform(transform: Transform3d) -> DecomposedTransform:
34
+ """
35
+ Returns:
36
+ scale: (..., 3) tensor of scale factors
37
+ rotation: (..., 3, 3) tensor of rotation matrices
38
+ translation: (..., 3) tensor of translation vectors
39
+ """
40
+ matrices = transform.get_matrix()
41
+ scale = torch.norm(matrices[:, :3, :3], dim=-1)
42
+ rotation = matrices[:, :3, :3] / scale.unsqueeze(-1) # Normalize rotation matrix
43
+ translation = matrices[:, 3, :3] # Extract translation vector
44
+ return DecomposedTransform(scale, rotation, translation)
45
+
46
+
47
+ def get_rotation_about_x_axis(angle: float = math.pi / 2) -> torch.Tensor:
48
+ axis = torch.tensor([1.0, 0.0, 0.0])
49
+ axis_angle = axis * angle
50
+ return axis_angle_to_quaternion(axis_angle)
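A round-trip sketch for the helpers above, assuming pytorch3d is installed; the scale, rotation, and translation values are arbitrary.

    import torch
    from pytorch3d.transforms import quaternion_to_matrix

    scale = torch.tensor([[1.0, 2.0, 0.5]])
    rotation = quaternion_to_matrix(get_rotation_about_x_axis())[None]  # (1, 3, 3), +90 deg about x
    translation = torch.tensor([[0.1, -0.2, 0.3]])

    tfm = compose_transform(scale, rotation, translation)
    dec = decompose_transform(tfm)
    # dec.scale, dec.rotation and dec.translation recover the inputs up to float error.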
thirdparty/sam3d/sam3d/sam3d_objects/data/utils.py ADDED
@@ -0,0 +1,243 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ from typing import Any, Iterable, Tuple, Union, Dict, Sequence, Mapping, Container
3
+ import optree
4
+ import torch
5
+ from collections.abc import Iterable
6
+ import inspect
7
+ import ast
8
+ import astor
9
+ from torch.utils import _pytree
10
+
11
+ # None = root, Iterable[Any] = path, Any = path of one
12
+ ChildPathType = Union[None, Iterable[Any], Any]
13
+ ArgsType = Iterable[ChildPathType]
14
+ KwargsType = Mapping[str, ChildPathType]
15
+ ArgsKwargsType = Tuple[ArgsType, KwargsType]
16
+ MappingType = Union[None, ArgsKwargsType, ArgsType, KwargsType]
17
+
18
+
19
+ def tree_transpose_level_one(
20
+ structure,
21
+ check_children=False,
22
+ map_fn=None,
23
+ is_leaf=None,
24
+ ):
25
+ _, outer_spec = optree.tree_flatten(
26
+ structure,
27
+ is_leaf=lambda x: x is not structure,
28
+ none_is_leaf=True,
29
+ )
30
+
31
+ spec = optree.tree_structure(structure, none_is_leaf=True, is_leaf=is_leaf)
32
+ children_spec = spec.children()
33
+ if len(children_spec) > 0:
34
+ inner_spec = children_spec[0]
35
+ if check_children:
36
+ for child_spec in children_spec[1:]:
37
+ assert (
38
+ inner_spec == child_spec
39
+ ), f"one child was found having a different tree structure ({inner_spec} != {child_spec})"
40
+
41
+ structure = optree.tree_transpose(outer_spec, inner_spec, structure)
42
+
43
+ if map_fn is not None:
44
+ structure = optree.tree_map(
45
+ map_fn,
46
+ structure,
47
+ is_leaf=lambda x: optree.tree_structure(
48
+ x, is_leaf=is_leaf, none_is_leaf=True
49
+ )
50
+ == outer_spec,
51
+ none_is_leaf=True,
52
+ )
53
+
54
+ return structure
55
+
56
+
57
58
+ def tree_tensor_map(fn, tree, *rest):
59
+ return optree.tree_map(
60
+ fn,
61
+ tree,
62
+ *rest,
63
+ is_leaf=lambda x: isinstance(x, torch.Tensor),
64
+ none_is_leaf=False,
65
+ )
66
+
67
+
68
+ def to_device(obj, device):
69
+ """Recursively moves all tensors in obj to the specified device.
70
+
71
+ Args:
72
+ obj: Object to move to device - can be a tensor, list, tuple, dict or any nested combination
73
+ device: Target device (e.g. 'cuda', 'cpu', torch.device('cuda:0') etc.)
74
+
75
+ Returns:
76
+ Same object structure with all contained tensors moved to specified device
77
+ """
78
+ to_fn = lambda x: x.to(device)
79
+ return optree.tree_map(to_fn, obj, is_leaf=torch.is_tensor, none_is_leaf=False)
80
+
81
+
82
+ def expand_right(tensor, target_shape):
83
+ """
84
+ e.g. takes a tensor of shape (a, b, c) and right-pads it with broadcast dimensions, expanding to a target_shape such as (a, b, c, d, e)
85
+ """
86
+ current_shape = tensor.shape
87
+ dims_to_add = len(target_shape) - len(current_shape)
88
+ result = tensor
89
+ for _ in range(dims_to_add):
90
+ result = result.unsqueeze(-1)
91
+ expand_shape = list(current_shape) + [-1] * dims_to_add
92
+ for i in range(len(target_shape)):
93
+ if i < len(expand_shape) and expand_shape[i] == -1:
94
+ expand_shape[i] = target_shape[i]
95
+ return result.expand(*expand_shape)
96
+
97
+
98
+ def expand_as_right(tensor, target):
99
+ return expand_right(tensor, target.shape)
100
+
101
+
102
+ def as_keys(path: ChildPathType):
103
+ if isinstance(path, Iterable) and (not isinstance(path, str)):
104
+ return tuple(path)
105
+ elif path is None:
106
+ return ()
107
+ return (path,)
108
+
109
+
110
+ def get_child(obj: Any, *keys: Iterable[Any]):
111
+ for key in keys:
112
+ obj = obj[key]
113
+ return obj
114
+
115
+
116
+ def set_child(obj: Any, value: Any, *keys: Iterable[Any]):
117
+ parent = None
118
+ for key in keys:
119
+ parent = obj
120
+ obj = obj[key]
121
+ if parent is None:
122
+ obj = value
123
+ else:
124
+ parent[key] = value
125
+ return obj
126
+
127
+
128
+ def build_args_batch_extractor(args_mapping: ArgsType):
129
+ def extract_fn(batch):
130
+ return tuple(get_child(batch, *as_keys(path)) for path in args_mapping)
131
+
132
+ return extract_fn
133
+
134
+
135
+ def build_kwargs_batch_extractor(kwargs_mapping: KwargsType):
136
+ def extract_fn(batch):
137
+ return {
138
+ name: get_child(batch, *as_keys(path))
139
+ for name, path in kwargs_mapping.items()
140
+ }
141
+
142
+ return extract_fn
143
+
144
+
145
+ empty_mapping = object()
146
+ kwargs_identity_mapping = object()
147
+
148
+
149
+ def build_batch_extractor(mapping: MappingType):
150
+ extract_args_fn = lambda x: ()
151
+ extract_kwargs_fn = lambda x: {}
152
+
153
+ if mapping is None:
154
+
155
+ def extract_args_fn(batch):
156
+ return (batch,)
157
+
158
+ elif mapping is empty_mapping:
159
+ pass
160
+ elif mapping is kwargs_identity_mapping:
161
+ extract_kwargs_fn = lambda x: x
162
+ elif isinstance(mapping, Sequence) and (not isinstance(mapping, str)):
163
+ if (
164
+ len(mapping) == 2
165
+ and isinstance(mapping[0], Sequence)
166
+ and isinstance(mapping[1], Dict)
167
+ ):
168
+ extract_args_fn = build_args_batch_extractor(mapping[0])
169
+ extract_kwargs_fn = build_kwargs_batch_extractor(mapping[1])
170
+ else:
171
+ extract_args_fn = build_args_batch_extractor(mapping)
172
+ elif isinstance(mapping, Mapping):
173
+ extract_kwargs_fn = build_kwargs_batch_extractor(mapping)
174
+ else:
175
+
176
+ def extract_args_fn(batch):
177
+ return (get_child(batch, *as_keys(mapping)),)
178
+
179
+ def extract_fn(batch):
180
+ return extract_args_fn(batch), extract_kwargs_fn(batch)
181
+
182
+ return extract_fn
183
+
184
+
185
+ # >
186
+
187
+
188
+ def right_broadcasting(arr, target):
189
+ return arr.reshape(arr.shape + (1,) * (target.ndim - arr.ndim))
190
+
191
+
192
+ def get_stats(tensor: torch.Tensor):
193
+ float_tensor = tensor.float()
194
+ return {
195
+ "shape": tuple(tensor.shape),
196
+ "min": tensor.min().item(),
197
+ "max": tensor.max().item(),
198
+ "mean": float_tensor.mean().item(),
199
+ "median": tensor.median().item(),
200
+ "std": float_tensor.std().item(),
201
+ }
202
+
203
+
204
+ def _get_caller_arg_name(argnum=0, parent_frame=1):
205
+ try:
206
+ frame = inspect.currentframe() # current frame
207
+ frame = inspect.getouterframes(frame)[1 + parent_frame] # parent frame
208
+ code = inspect.getframeinfo(frame[0]).code_context[0].strip() # get code line
209
+
210
+ tree = ast.parse(code)
211
+
212
+ for node in ast.walk(tree):
213
+ if isinstance(node, ast.Call):
214
+ args = node.args
215
+ break # only get the first parent call
216
+
217
+ # get first argument string (do not handle '=')
218
+ label = astor.to_source(args[argnum]).strip()
219
+ except:
220
+ # TODO(Pierre) log exception
221
+ label = "{label}"
222
+ return label
223
+
224
+
225
+ def print_stats(tensor, label=None):
226
+ if label is None:
227
+ label = _get_caller_arg_name(argnum=0)
228
+ stats = get_stats(tensor)
229
+ string = f"{label}:\n" + "\n".join(f"- {k}: {v}" for k, v in stats.items())
230
+ print(string)
231
+
232
+
233
+ def tree_reduce_unique(fn, tree, ensure_unique=True, **kwargs):
234
+ values = _pytree.tree_flatten(tree, **kwargs)[0]
235
+ values = tuple(map(fn, values))
236
+ first = values[0]
237
+ if ensure_unique:
238
+ for value in values[1:]:
239
+ if value != first:
240
+ raise RuntimeError(
241
+ f"different values found, {value} and {first} should be the same"
242
+ )
243
+ return first
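The mapping helpers above can be exercised with a tiny, made-up nested batch; the keys below are purely illustrative.

    batch = {"inputs": {"image": 1, "mask": 2}, "target": 3}

    extract = build_batch_extractor(((("inputs", "image"),), {"y": "target"}))
    args, kwargs = extract(batch)
    # args == (1,) and kwargs == {"y": 3}: the sequence part of the mapping selects
    # positional arguments by key path, the dict part selects keyword arguments.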
thirdparty/sam3d/sam3d/sam3d_objects/model/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/dit/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/dit/embedder/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/dit/embedder/dino.py ADDED
@@ -0,0 +1,142 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import torch
3
+ from typing import Optional, Dict, Any
4
+ import warnings
5
+ from torchvision.transforms import Normalize
6
+ import torch.nn.functional as F
7
+ from loguru import logger
8
+
9
+
10
+ class Dino(torch.nn.Module):
11
+ def __init__(
12
+ self,
13
+ input_size: int = 224,
14
+ repo_or_dir: str = "facebookresearch/dinov2",
15
+ dino_model: str = "dinov2_vitb14",
16
+ source: str = "github",
17
+ backbone_kwargs: Optional[Dict[str, Any]] = None,
18
+ normalize_images: bool = True,
19
+ # for backward compatibility
20
+ prenorm_features: bool = False,
21
+ freeze_backbone: bool = True,
22
+ prune_network: bool = False,  # False for backward compatibility
23
+ ):
24
+ super().__init__()
25
+ if backbone_kwargs is None:
26
+ backbone_kwargs = {}
27
+
28
+ with warnings.catch_warnings():
29
+ warnings.simplefilter("ignore")
30
+
31
+ logger.info(f"Loading DINO model: {dino_model} from {repo_or_dir} (source: {source})")
32
+ if backbone_kwargs:
33
+ logger.info(f"DINO backbone kwargs: {backbone_kwargs}")
34
+
35
+ self.backbone = torch.hub.load(
36
+ repo_or_dir=repo_or_dir,
37
+ model=dino_model,
38
+ source=source,
39
+ verbose=False,
40
+ **backbone_kwargs,
41
+ )
42
+
43
+ # Log model properties after loading
44
+ logger.info(f"Loaded DINO model - type: {type(self.backbone)}, "
45
+ f"embed_dim: {self.backbone.embed_dim}, "
46
+ f"patch_size: {getattr(self.backbone.patch_embed, 'patch_size', 'N/A')}")
47
+
48
+
49
+ self.resize_input_size = (input_size, input_size)
50
+ self.embed_dim = self.backbone.embed_dim
51
+ self.input_size = input_size
52
+ self.input_channels = 3
53
+ self.normalize_images = normalize_images
54
+ self.prenorm_features = prenorm_features
55
+ self.register_buffer('mean', torch.as_tensor([[0.485, 0.456, 0.406]]).view(-1, 1, 1), persistent=False)
56
+ self.register_buffer('std', torch.as_tensor([[0.229, 0.224, 0.225]]).view(-1, 1, 1), persistent=False)
57
+
58
+ # freeze
59
+ if freeze_backbone:
60
+ self.requires_grad_(False)
61
+ self.eval()
62
+ elif not prune_network:
63
+ logger.warning(
64
+ "Unfreeze encoder w/o prune parameter may lead to error in ddp/fp16 training"
65
+ )
66
+
67
+ if prune_network:
68
+ self._prune_network()
69
+
70
+ def _preprocess_input(self, x):
71
+ _resized_images = torch.nn.functional.interpolate(
72
+ x,
73
+ size=self.resize_input_size,
74
+ mode="bilinear",
75
+ align_corners=False,
76
+ )
77
+
78
+ if x.shape[1] == 1:
79
+ _resized_images = _resized_images.repeat(1, 3, 1, 1)
80
+
81
+ if self.normalize_images:
82
+ _resized_images = _resized_images.sub_(self.mean).div_(self.std)
83
+
84
+ return _resized_images
85
+
86
+ def _forward_intermediate_layers(
87
+ self, input_img, intermediate_layers, cls_token=True
88
+ ):
89
+ return self.backbone.get_intermediate_layers(
90
+ input_img,
91
+ intermediate_layers,
92
+ return_class_token=cls_token,
93
+ )
94
+
95
+ def _forward_last_layer(self, input_img):
96
+ output = self.backbone.forward_features(input_img)
97
+ if self.prenorm_features:
98
+ features = output["x_prenorm"]
99
+ tokens = F.layer_norm(features, features.shape[-1:])
100
+ else:
101
+ tokens = torch.cat(
102
+ [
103
+ output["x_norm_clstoken"].unsqueeze(1),
104
+ output["x_norm_patchtokens"],
105
+ ],
106
+ dim=1,
107
+ )
108
+ return tokens
109
+
110
+ def forward(self, x, **kwargs):
111
+ _resized_images = self._preprocess_input(x)
112
+ tokens = self._forward_last_layer(_resized_images)
113
+ return tokens.to(x.dtype)
114
+
115
+ def _prune_network(self):
116
+ """
117
+ Ran this script:
118
+ out = model(input)
119
+ loss = out.sum()
120
+ loss.backward()
121
+
122
+ for name, p in dino_model.named_parameters():
123
+ if p.grad is None:
124
+ print(name)
125
+ model.zero_grad()
126
+ """
127
+ self.backbone.mask_token = None
128
+ if self.prenorm_features:
129
+ self.backbone.norm = torch.nn.Identity()
130
+
131
+
132
+ class DinoForMasks(torch.nn.Module):
133
+ def __init__(
134
+ self,
135
+ backbone: Dino,
136
+ ):
137
+ super().__init__()
138
+ self.backbone = backbone
139
+ self.embed_dim = self.backbone.embed_dim
140
+
141
+ def forward(self, image, mask):
142
+ return self.backbone.forward(mask)
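An illustrative usage sketch for the encoder above. It assumes torch.hub can download the default facebookresearch/dinov2 weights; batch size and input resolution are arbitrary.

    import torch

    encoder = Dino(input_size=224, dino_model="dinov2_vitb14")
    images = torch.rand(2, 3, 480, 640)  # any resolution; resized to 224x224 internally
    with torch.no_grad():
        tokens = encoder(images)
    # tokens: (2, 257, 768) for ViT-B/14 -- one CLS token plus (224 / 14) ** 2 patch tokens.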
thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/dit/embedder/embedder_fuser.py ADDED
@@ -0,0 +1,238 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import math
3
+ import torch
4
+ from loguru import logger
5
+ from torch import nn
6
+ from typing import Optional, Tuple, List, Literal, Dict
7
+ from sam3d_objects.model.layers.llama3.ff import FeedForward
8
+ from omegaconf import OmegaConf
9
+
10
+ class EmbedderFuser(torch.nn.Module):
11
+ """
12
+ Fuses individual condition embedders. Requires kwargs in the forward pass.
13
+ Args:
14
+ embedder_list: List of tuples. Each tuple consists of a condition embedder
15
+ and a list of (kwarg_name, pos_group) tuples, where kwarg_name is the forward
16
+ kwarg to embed and pos_group is the positional-encoding group to use.
17
+ use_pos_embedding: whether to add a positional embedding. If used, indices follow
18
+ the order in embedder_list. Choices of None (no pos emb), "random", and "learned".
19
+ projection_pre_norm: pre-normalize features before feeding into projector layers.
20
+ projection_net_hidden_dim_multiplier: hidden dimension for projection layer. If 0, don't use.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ embedder_list: List[Tuple[nn.Module, List[Tuple[str, Optional[str]]]]],
26
+ use_pos_embedding: Optional[Literal["random", "learned"]] = "learned",
27
+ projection_pre_norm: bool = True,
28
+ projection_net_hidden_dim_multiplier: float = 4.0,
29
+ compression_projection_multiplier: float = 0,
30
+ freeze: bool = False,
31
+ drop_modalities_weight: Optional[List[Tuple[List[str], float]]] = None,
32
+ dropout_prob: float = 0.0,
33
+ force_drop_modalities: Optional[List[str]] = None,
34
+ ):
35
+ super().__init__()
36
+ # torch.compile does not support OmegaConf.ListConfig, so we convert to a list
37
+ if not isinstance(embedder_list, List):
38
+ self.embedder_list = OmegaConf.to_container(embedder_list)
39
+ else:
40
+ self.embedder_list = embedder_list
41
+
42
+ self.embed_dims = 0
43
+ self.compression_projection_multiplier = compression_projection_multiplier
44
+ self.concate_embed_dims = 0
45
+ # keep moduleList to be compatible with nn module
46
+ self.module_list = []
47
+ max_positional_embed_idx = 0
48
+ self.positional_embed_map = {}
49
+ for condition_embedder, kwargs_info in self.embedder_list:
50
+ self.embed_dims = max(self.embed_dims, condition_embedder.embed_dim)
51
+ self.module_list.append(condition_embedder)
52
+ for _, pos_group in kwargs_info:
53
+ self.concate_embed_dims += condition_embedder.embed_dim
54
+ if pos_group is not None:
55
+ if pos_group not in self.positional_embed_map:
56
+ self.positional_embed_map[pos_group] = max_positional_embed_idx
57
+ max_positional_embed_idx += 1
58
+ self.module_list = nn.ModuleList(self.module_list)
59
+ self.use_pos_embedding = use_pos_embedding
60
+ if self.use_pos_embedding == "random":
61
+ idx_emb = torch.randn(max_positional_embed_idx + 1, 1, self.embed_dims)
62
+ self.register_buffer("idx_emb", idx_emb)
63
+ elif self.use_pos_embedding == "learned":
64
+ self.idx_emb = nn.Parameter(
65
+ torch.empty(max_positional_embed_idx + 1, self.embed_dims)
66
+ )
67
+ nn.init.normal_(
68
+ self.idx_emb, mean=0.0, std=1.0 / math.sqrt(self.embed_dims)
69
+ )
70
+ else:
71
+ raise NotImplementedError(f"Unknown pos embedding {self.use_pos_embedding}")
72
+
73
+ self.projection_pre_norm = projection_pre_norm
74
+ self.projection_net_hidden_dim_multiplier = projection_net_hidden_dim_multiplier
75
+ if projection_net_hidden_dim_multiplier > 0:
76
+ self.projection_nets = []
77
+ for condition_embedder, _ in self.embedder_list:
78
+ self.projection_nets.append(
79
+ self._make_projection_net(
80
+ condition_embedder.embed_dim,
81
+ self.embed_dims,
82
+ self.projection_net_hidden_dim_multiplier,
83
+ )
84
+ )
85
+ self.projection_nets = nn.ModuleList(self.projection_nets)
86
+
87
+ if compression_projection_multiplier > 0:
88
+ self.compression_projector = self._make_projection_net(
89
+ self.concate_embed_dims,
90
+ self.embed_dims,
91
+ self.compression_projection_multiplier,
92
+ )
93
+
94
+ self.drop_modalities_weight = drop_modalities_weight if drop_modalities_weight is not None else []
95
+ self.dropout_prob = dropout_prob
96
+ self.force_drop_modalities = force_drop_modalities
97
+
98
+ if freeze:
99
+ self.requires_grad_(False)
100
+ self.eval()
101
+
102
+ def _make_projection_net(
103
+ self,
104
+ input_embed_dim,
105
+ output_embed_dim: int,
106
+ multiplier: int,
107
+ ):
108
+ if self.projection_pre_norm:
109
+ pre_norm = nn.LayerNorm(input_embed_dim)
110
+ else:
111
+ pre_norm = nn.Identity()
112
+
113
+ # Per-token projection + gated activation
114
+ ff_net = FeedForward(
115
+ dim=input_embed_dim,
116
+ hidden_dim=int(multiplier * output_embed_dim),
117
+ output_dim=output_embed_dim,
118
+ )
119
+ return nn.Sequential(pre_norm, ff_net)
120
+
121
+ def _build_dropout_distribution(self, device):
122
+ """
123
+ Build the probability distribution for dropout configurations.
124
+
125
+ Returns:
126
+ dropout_configs: List of sets containing modalities to drop
127
+ cumsum_weights: Cumulative sum of weights for sampling
128
+ """
129
+ dropout_configs = []
130
+ weights = []
131
+
132
+ # Add no-dropout configuration with remaining probability
133
+ dropout_configs.append(set())
134
+ weights.append(1.0 - self.dropout_prob)
135
+
136
+ # Add configured dropout patterns
137
+ total_dropout_weight = sum(w for _, w in self.drop_modalities_weight)
138
+ assert total_dropout_weight > 0, "Total dropout weight must be positive when drop_modalities_weight is provided"
139
+ for modality_list, weight in self.drop_modalities_weight:
140
+ dropout_configs.append(set(modality_list))
141
+ # Scale weight by dropout_prob to ensure total probability sums to 1
142
+ weights.append(self.dropout_prob * weight / total_dropout_weight)
143
+
144
+ # Convert weights to cumulative distribution
145
+ weights_tensor = torch.tensor(weights, device=device)
146
+
147
+ was_deterministic = torch.are_deterministic_algorithms_enabled()
148
+ torch.use_deterministic_algorithms(False)
149
+ cumsum_weights = torch.cumsum(weights_tensor, dim=0)
150
+ torch.use_deterministic_algorithms(was_deterministic)
151
+
152
+ return dropout_configs, cumsum_weights
153
+
154
+ def _apply_force_drop(self, kwarg_names: List[str], tokens: List[torch.Tensor]):
155
+ if not self.force_drop_modalities:
156
+ return tokens
157
+
158
+ force_drop_set = set(self.force_drop_modalities)
159
+ result_tokens = []
160
+
161
+ for kwarg_name, token_tensor in zip(kwarg_names, tokens):
162
+ # Create mask: 0 for forced drop, 1 otherwise
163
+ mask = 0.0 if kwarg_name in force_drop_set else 1.0
164
+ result_tokens.append(token_tensor * mask)
165
+
166
+ return result_tokens
167
+
168
+ def _dropout_modalities(self, kwarg_names: List[str], tokens: List[torch.Tensor]):
169
+ # First apply forced drops (deterministic, always applied)
170
+ tokens = self._apply_force_drop(kwarg_names, tokens)
171
+
172
+ # Then apply probabilistic dropout (only in training)
173
+ if not self.training or self.dropout_prob <= 0 or not self.drop_modalities_weight:
174
+ return tokens
175
+
176
+ batch_size = tokens[0].shape[0]
177
+ device = tokens[0].device
178
+
179
+ # Build dropout configurations and sample which to use per batch element
180
+ dropout_configs, cumsum_weights = self._build_dropout_distribution(device)
181
+ rand_vals = torch.rand(batch_size, device=device)
182
+ # Clamp indices to valid range (handle edge case where rand_val == 1.0)
183
+ config_indices = torch.searchsorted(cumsum_weights, rand_vals).clamp(max=len(dropout_configs) - 1)
184
+
185
+ # Apply dropout masks with vectorized operations
186
+ result_tokens = []
187
+ for kwarg_name, token_tensor in zip(kwarg_names, tokens):
188
+ # Start with all ones (no dropout)
189
+ mask = torch.ones(batch_size, dtype=token_tensor.dtype, device=device)
190
+
191
+ # Vectorized mask creation: check all configurations at once
192
+ for config_idx, modalities_to_drop in enumerate(dropout_configs):
193
+ if kwarg_name in modalities_to_drop:
194
+ # Set mask to 0 for all batch elements using this configuration
195
+ mask[config_indices == config_idx] = 0.0
196
+
197
+ # Reshape mask to match token dimensions
198
+ mask = mask.view([batch_size] + [1] * (token_tensor.ndim - 1))
199
+ result_tokens.append(token_tensor * mask)
200
+
201
+ return result_tokens
202
+
203
+ def forward(self, *args, **kwargs):
204
+ tokens = []
205
+ kwarg_names = []
206
+
207
+ for i, (condition_embedder, kwargs_info) in enumerate(self.embedder_list):
208
+ # Ideally, we would batch the inputs; but that assumes same-sized inputs
209
+ for kwarg_name, pos_group in kwargs_info:
210
+ if kwarg_name not in kwargs:
211
+ logger.warning(f"{kwarg_name} not in kwargs to condition embedder!")
212
+ input_cond = kwargs[kwarg_name]
213
+ cond_token = condition_embedder(input_cond)
214
+ if self.projection_net_hidden_dim_multiplier > 0:
215
+ cond_token = self.projection_nets[i](cond_token)
216
+ if pos_group is not None:
217
+ pos_idx = self.positional_embed_map[pos_group]
218
+ if self.use_pos_embedding == "random":
219
+ cond_token += self.idx_emb[pos_idx : pos_idx + 1]
220
+ elif self.use_pos_embedding == "learned":
221
+ cond_token += self.idx_emb[pos_idx : pos_idx + 1, None]
222
+ else:
223
+ raise NotImplementedError(
224
+ f"Unknown pos embedding {self.use_pos_embedding}"
225
+ )
226
+ tokens.append(cond_token)
227
+ kwarg_names.append(kwarg_name)
228
+
229
+ # Apply dropout modalities with preserved order
230
+ tokens = self._dropout_modalities(kwarg_names, tokens)
231
+
232
+ if self.compression_projection_multiplier > 0:
233
+ tokens = torch.cat(tokens, dim=-1)
234
+ tokens = self.compression_projector(tokens)
235
+ else:
236
+ tokens = torch.cat(tokens, dim=1)
237
+
238
+ return tokens
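A minimal wiring sketch for the fuser above. DummyEmbedder is a hypothetical stand-in for real condition embedders (it only needs an embed_dim attribute and token-shaped outputs); the per-embedder projection nets are disabled so the sketch does not depend on the FeedForward layer.

    import torch
    from torch import nn

    class DummyEmbedder(nn.Module):
        def __init__(self, embed_dim=64):
            super().__init__()
            self.embed_dim = embed_dim
            self.proj = nn.Linear(8, embed_dim)

        def forward(self, x):  # (B, T, 8) -> (B, T, embed_dim)
            return self.proj(x)

    fuser = EmbedderFuser(
        embedder_list=[
            (DummyEmbedder(), [("image_tokens", "image")]),
            (DummyEmbedder(), [("pointmap_tokens", "pointmap")]),
        ],
        use_pos_embedding="learned",
        projection_net_hidden_dim_multiplier=0,  # skip the projection nets for brevity
    )
    fused = fuser(image_tokens=torch.rand(2, 5, 8), pointmap_tokens=torch.rand(2, 7, 8))
    # fused: (2, 12, 64) -- tokens from both embedders concatenated along the sequence dim.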
thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/dit/embedder/point_remapper.py ADDED
@@ -0,0 +1,78 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+
6
+ class PointRemapper(nn.Module):
7
+ """Handles remapping of 3D point coordinates and their inverse transformations."""
8
+
9
+ VALID_TYPES = ["linear", "sinh", "exp", "sinh_exp", "exp_disparity"]
10
+
11
+ def __init__(self, remap_type: str = "exp"):
12
+ super().__init__()
13
+ self.remap_type = remap_type
14
+
15
+ if remap_type not in self.VALID_TYPES:
16
+ raise ValueError(
17
+ f"Invalid remap type: {remap_type}. Must be one of {self.VALID_TYPES}"
18
+ )
19
+
20
+ def forward(self, points: torch.Tensor) -> torch.Tensor:
21
+ """Apply remapping to point coordinates."""
22
+ if self.remap_type == "linear":
23
+ return points
24
+
25
+ elif self.remap_type == "sinh":
26
+ return torch.asinh(points)
27
+
28
+ elif self.remap_type == "exp":
29
+ xy_scaled, z_exp = points.split([2, 1], dim=-1)
30
+ # Use log1p for better numerical stability near zero
31
+ z = torch.log1p(z_exp)
32
+ xy = xy_scaled / (1 + z_exp)
33
+ return torch.cat([xy, z], dim=-1)
34
+
35
+ elif self.remap_type == "exp_disparity":
36
+ xy_scaled, z_exp = points.split([2, 1], dim=-1)
37
+ xy = xy_scaled / z_exp
38
+ z = torch.log(z_exp)
39
+ return torch.cat([xy, z], dim=-1)
40
+
41
+ elif self.remap_type == "sinh_exp":
42
+ xy_sinh, z_exp = points.split([2, 1], dim=-1)
43
+ xy = torch.asinh(xy_sinh)
44
+ z = torch.log(z_exp.clamp(min=1e-8))
45
+ return torch.cat([xy, z], dim=-1)
46
+
47
+ else:
48
+ raise ValueError(f"Unknown remap type: {self.remap_type}")
49
+
50
+ def inverse(self, points: torch.Tensor) -> torch.Tensor:
51
+ """Apply inverse remapping to recover original point coordinates."""
52
+ if self.remap_type == "linear":
53
+ return points
54
+
55
+ elif self.remap_type == "sinh":
56
+ return torch.sinh(points)
57
+
58
+ elif self.remap_type == "exp":
59
+ xy, z = points.split([2, 1], dim=-1)
60
+ # Inverse of log1p is expm1(z) = exp(z) - 1
61
+ z_exp = torch.expm1(z)
62
+ # Inverse of xy/(1+z_exp) is xy*(1+z_exp)
63
+ return torch.cat([xy * (1 + z_exp), z_exp], dim=-1)
64
+
65
+ elif self.remap_type == "exp_disparity":
66
+ xy, z = points.split([2, 1], dim=-1)
67
+ z_exp = torch.exp(z)
68
+ return torch.cat([xy * z_exp, z_exp], dim=-1)
69
+
70
+ elif self.remap_type == "sinh_exp":
71
+ xy, z = points.split([2, 1], dim=-1)
72
+ return torch.cat([torch.sinh(xy), torch.exp(z)], dim=-1)
73
+
74
+ else:
75
+ raise ValueError(f"Unknown remap type: {self.remap_type}")
76
+
77
+ def extra_repr(self) -> str:
78
+ return f"remap_type='{self.remap_type}'"
thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/dit/embedder/pointmap.py ADDED
@@ -0,0 +1,238 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ from timm.models.vision_transformer import Block
3
+ import torch
4
+ from torch import nn
5
+ import torch.nn.functional as F
6
+ from functools import partial
7
+ from loguru import logger
8
+
9
+ from .point_remapper import PointRemapper
10
+
11
+
12
+ class PointPatchEmbed(nn.Module):
13
+ """
14
+ Projects (x,y,z) → D
15
+ Splits into patches (patch_size x patch_size)
16
+ Runs a tiny self-attention block inside each window
17
+ Returns one token per window.
18
+ """
19
+
20
+ def __init__(
21
+ self,
22
+ input_size: int = 256,
23
+ patch_size: int = 8,
24
+ embed_dim: int = 768,
25
+ remap_output: str = "exp", # Add remap_output parameter
26
+ dropout_prob: float = 0.0, # Dropout probability for pointmap
27
+ force_dropout_always: bool = False, # Force dropout during validation/inference
28
+ ):
29
+ super().__init__()
30
+ self.input_size = input_size
31
+ self.patch_size = patch_size
32
+ self.embed_dim = embed_dim
33
+ self.dropout_prob = dropout_prob
34
+ self.force_dropout_always = force_dropout_always
35
+
36
+ # Point remapper
37
+ self.point_remapper = PointRemapper(remap_output)
38
+
39
+ # (1) point embedding
40
+ self.point_proj = nn.Linear(3, embed_dim)
41
+ self.invalid_xyz_token = nn.Parameter(torch.zeros(embed_dim))
42
+
43
+ # Special embedding for dropped patches (used during dropout)
44
+ # Alternative dropout strategies to consider:
45
+ # 1. Drop all tokens entirely or use a single token only
46
+ # 2. Different dropout patterns per window
47
+ # 3. Use dropped_xyz_token/invalid_xyz_token per pixel
48
+ if dropout_prob > 0:
49
+ self.dropped_xyz_token = nn.Parameter(torch.zeros(embed_dim))
50
+
51
+ # (2) positional embedding
52
+ num_patches = input_size // patch_size
53
+ # For patches
54
+ self.pos_embed = nn.Parameter(
55
+ torch.zeros(1, embed_dim, num_patches, num_patches)
56
+ )
57
+ # For points in a patch
58
+ self.pos_embed_window = nn.Parameter(
59
+ torch.zeros(1, 1 + patch_size * patch_size, embed_dim)
60
+ )
61
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
62
+
63
+ # (3) within-patch transformer block(s)
64
+ # From MCC: https://github.com/facebookresearch/MCC/blob/b04c97518360e4fdedfb6f090db7e90d0c2f8ae6/mcc_model.py#L97
65
+ self.blocks = nn.ModuleList(
66
+ [
67
+ Block(
68
+ embed_dim,
69
+ num_heads=16,
70
+ mlp_ratio=2.0,
71
+ qkv_bias=True,
72
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
73
+ )
74
+ ]
75
+ )
76
+ self.initialize_weights()
77
+
78
+ def initialize_weights(self):
79
+ # Initialize positional embeddings with small std
80
+ nn.init.normal_(self.pos_embed, std=0.02)
81
+ nn.init.normal_(self.pos_embed_window, std=0.02)
82
+ nn.init.normal_(self.cls_token, std=0.02)
83
+ nn.init.normal_(self.invalid_xyz_token, std=0.02)
84
+
85
+ # Initialize dropped pointmap token if dropout is enabled
86
+ if self.dropout_prob > 0:
87
+ nn.init.normal_(self.dropped_xyz_token, std=0.02)
88
+
89
+ # Initialize point projection with xavier uniform for better gradient flow
90
+ # This is crucial since pointmaps can have large value ranges
91
+ nn.init.xavier_uniform_(self.point_proj.weight, gain=0.02)
92
+ if self.point_proj.bias is not None:
93
+ nn.init.constant_(self.point_proj.bias, 0)
94
+
95
+ def _get_pos_embed(self, hw):
96
+ h, w = hw
97
+ pos_embed = F.interpolate(
98
+ self.pos_embed, size=(h, w), mode="bilinear", align_corners=False
99
+ )
100
+ pos_embed = pos_embed.permute(0, 2, 3, 1) # (B, H, W, C)
101
+ return pos_embed
102
+
103
+ def resize_input(self, xyz: torch.Tensor) -> torch.Tensor:
104
+ resized_xyz = F.interpolate(xyz, size=self.input_size, mode="nearest")
105
+ resized_xyz = resized_xyz.permute(0, 2, 3, 1) # (B, H, W, C)
106
+ return resized_xyz
107
+
108
+ def apply_pointmap_dropout(self, embeddings: torch.Tensor) -> torch.Tensor:
109
+ """
110
+ Apply dropout to pointmap embeddings.
111
+ Drops entire pointmap for selected samples during training or when forced.
112
+
113
+ When force_dropout_always is True, always drops pointmap regardless of training mode.
114
+ """
115
+ # Check if we should apply dropout
116
+ should_apply_dropout = (self.training or self.force_dropout_always) and self.dropout_prob > 0
117
+
118
+ if not should_apply_dropout:
119
+ return embeddings
120
+
121
+ # Check if dropout infrastructure exists
122
+ if not hasattr(self, 'dropped_xyz_token'):
123
+ if self.force_dropout_always:
124
+ raise RuntimeError(
125
+ "Cannot force dropout: model was initialized with dropout_prob=0. "
126
+ "Re-initialize with dropout_prob > 0 to enable forced dropout."
127
+ )
128
+ return embeddings
129
+
130
+ batch_size, n_windows, embed_dim = embeddings.shape
131
+
132
+ # Decide dropout behavior
133
+ if self.force_dropout_always and not self.training:
134
+ # When forced during inference, always drop (100% dropout)
135
+ drop_mask = torch.ones(batch_size, device=embeddings.device, dtype=torch.bool)
136
+ else:
137
+ # Normal training dropout - use configured probability
138
+ drop_mask = torch.rand(batch_size, device=embeddings.device) < self.dropout_prob
139
+
140
+ # Create dropped embedding for all windows - use same token for all patches
141
+ # Shape: (batch_size, n_windows, embed_dim)
142
+ dropped_embedding = self.dropped_xyz_token.view(1, 1, embed_dim).expand(batch_size, n_windows, embed_dim)
143
+
144
+ # Add positional embeddings to dropped tokens (same as regular embeddings get)
145
+ n_windows_h = n_windows_w = int(n_windows ** 0.5)
146
+ pos_embed_patch = self._get_pos_embed((n_windows_h, n_windows_w)).reshape(
147
+ 1, n_windows, embed_dim
148
+ )
149
+ dropped_embedding = dropped_embedding + pos_embed_patch
150
+ drop_mask_expanded = drop_mask.view(batch_size, 1, 1).expand_as(embeddings)
151
+ embeddings = torch.where(drop_mask_expanded, dropped_embedding, embeddings)
152
+
153
+ return embeddings
154
+
155
+ @torch._dynamo.disable()
156
+ def embed_pointmap_windows(
157
+ self, xyz: torch.Tensor, valid_mask: torch.Tensor = None
158
+ ) -> torch.Tensor:
159
+ """Process pointmap into window embeddings without positional encoding"""
160
+ with torch.no_grad():
161
+ xyz = self.resize_input(xyz)
162
+ if valid_mask is None:
163
+ valid_mask = xyz.isfinite().all(dim=-1)
164
+
165
+ B, H, W, _ = xyz.shape
166
+ assert (
167
+ H % self.patch_size == 0 and W % self.patch_size == 0
168
+ ), "image must be divisible by patch_size"
169
+
170
+ # (1) Handle NaN values before remapping to prevent propagation
171
+ xyz_safe = xyz.clone()
172
+ xyz_safe[~valid_mask] = 0.0 # Set invalid points to 0 before remapping
173
+
174
+ # (1b) remap points to normalize their range
175
+ xyz_remapped = self.point_remapper(xyz_safe)
176
+
177
+ # (2) project + invalid token
178
+ x = self.point_proj(xyz_remapped) # (B,H,W,D)
179
+
180
+ x[~valid_mask] = 0.0 # Stop gradient for invalid points
181
+ x[~valid_mask] += self.invalid_xyz_token
182
+
183
+ return x, B, H, W
184
+
185
+ def inner_forward(
186
+ self, x: torch.Tensor, B: int, H: int, W: int
187
+ ) -> torch.Tensor:
188
+ x = x.view(
189
+ B,
190
+ H // self.patch_size,
191
+ self.patch_size,
192
+ W // self.patch_size,
193
+ self.patch_size,
194
+ self.embed_dim,
195
+ )  # (B, hW, ws, wW, ws, D)
196
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous() # (B, hW, wW, ws, ws, D)
197
+ x = x.view(-1, self.patch_size * self.patch_size, self.embed_dim)
198
+
199
+ # (4) CLS token that contains the patch information
200
+ cls_tok = self.cls_token.expand(x.shape[0], -1, -1)
201
+ toks = torch.cat([cls_tok, x], dim=1)
202
+
203
+ # (5) add positional embedding for window
204
+ toks = toks + self.pos_embed_window
205
+
206
+ # (6) intra-window attention
207
+ for blk in self.blocks:
208
+ toks = blk(toks)
209
+
210
+ # (7) Extract CLS tokens and reshape to (B, n_windows, embed_dim)
211
+ n_windows_h = H // self.patch_size
212
+ n_windows_w = W // self.patch_size
213
+ window_embeddings = toks[:, 0].view(B, n_windows_h * n_windows_w, self.embed_dim)
214
+
215
+ # Add positional embeddings
216
+ pos_embed_patch = self._get_pos_embed((n_windows_h, n_windows_w)).reshape(
217
+ 1, n_windows_h * n_windows_w, self.embed_dim
218
+ )
219
+ out = window_embeddings + pos_embed_patch
220
+
221
+ # Apply dropout if enabled (during training OR when forced)
222
+ if (self.training or self.force_dropout_always) and self.dropout_prob > 0:
223
+ out = self.apply_pointmap_dropout(out)
224
+
225
+ return out
226
+
227
+ def forward(
228
+ self, xyz: torch.Tensor, valid_mask: torch.Tensor = None
229
+ ) -> torch.Tensor:
230
+ """
231
+ xyz : (B, 3, H, W) map of (x,y,z) coordinates
232
+ valid_mask : (B, H, W) boolean - True for valid points (optional)
233
+
234
+ returns: (B, num_windows, D)
235
+ """
236
+ # Get window embeddings
237
+ x, B, H, W = self.embed_pointmap_windows(xyz, valid_mask)
238
+ return self.inner_forward(x, B, H, W)
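A shape-check sketch for the window embedder above; it assumes timm is installed (for the transformer Block) and uses a small 64x64 pointmap so the shapes are easy to follow.

    import torch

    embed = PointPatchEmbed(input_size=64, patch_size=8, embed_dim=768)
    xyz = torch.rand(2, 3, 64, 64)  # (B, 3, H, W) pointmap, all values finite
    tokens = embed(xyz)
    # tokens: (2, 64, 768) -- one token per 8x8 window of the 64x64 map.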
thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/generator/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/generator/base.py ADDED
@@ -0,0 +1,65 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import torch
3
+ from typing import Optional, Union
4
+
5
+
6
+ class Base(torch.nn.Module):
7
+ def __init__(self, seed_or_generator: Optional[Union[int, torch.Generator]] = None):
8
+ super().__init__()
9
+
10
+ if isinstance(seed_or_generator, torch.Generator):
11
+ self.random_generator = seed_or_generator
12
+ elif isinstance(seed_or_generator, int):
13
+ self.seed = seed_or_generator
14
+ elif seed_or_generator is None:
15
+ self.random_generator = torch.default_generator
16
+ else:
17
+ raise RuntimeError(
18
+ f"cannot use argument of type {type(seed_or_generator)} to set random generator"
19
+ )
20
+
21
+ @property
22
+ def seed(self):
23
+ raise AttributeError(f"Cannot read attribute 'seed'.")
24
+
25
+ @seed.setter
26
+ def seed(self, value: int):
27
+ self._random_generator = torch.Generator().manual_seed(value)
28
+
29
+ @property
30
+ def random_generator(self):
31
+ return self._random_generator
32
+
33
+ @random_generator.setter
34
+ def random_generator(self, generator: torch.Generator):
35
+ self._random_generator = generator
36
+
37
+ def forward(self, x_shape, x_device, *args_conditionals, **kwargs_conditionals):
38
+ return self.generate(
39
+ x_shape,
40
+ x_device,
41
+ *args_conditionals,
42
+ **kwargs_conditionals,
43
+ )
44
+
45
+ def generate(self, x_shape, x_device, *args_conditionals, **kwargs_conditionals):
46
+ for _, xt, _ in self.generate_iter(
47
+ x_shape,
48
+ x_device,
49
+ *args_conditionals,
50
+ **kwargs_conditionals,
51
+ ):
52
+ pass
53
+ return xt
54
+
55
+ def generate_iter(
56
+ self,
57
+ x_shape,
58
+ x_device,
59
+ *args_conditionals,
60
+ **kwargs_conditionals,
61
+ ):
62
+ raise NotImplementedError
63
+
64
+ def loss(self, x, *args_conditionals, **kwargs_conditionals):
65
+ raise NotImplementedError
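A minimal, hypothetical subclass sketch showing the contract between generate() and generate_iter(): the iterator yields (step, current_sample, extra) triples and generate() returns the final sample.

    import torch

    class ToyGenerator(Base):
        def generate_iter(self, x_shape, x_device, *args, **kwargs):
            xt = torch.randn(x_shape, device=x_device, generator=self.random_generator)
            for step in range(3):  # stand-in for real solver steps
                xt = 0.5 * xt
                yield step, xt, None

    gen = ToyGenerator(seed_or_generator=0)
    sample = gen.generate((2, 4), "cpu")  # iterates generate_iter and returns the last xt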
thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/generator/classifier_free_guidance.py ADDED
@@ -0,0 +1,259 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ from functools import partial
3
+ from numbers import Number
4
+ import torch
5
+ import random
6
+ from torch.utils import _pytree
7
+ from torch.utils._pytree import tree_map_only
8
+ from loguru import logger
9
+
10
+ def _zeros_like(struct):
11
+ def make_zeros(x):
12
+ if isinstance(x, torch.Tensor):
13
+ return torch.zeros_like(x)
14
+ return x
15
+
16
+ return _pytree.tree_map(make_zeros, struct)
17
+
18
+
19
+ def zero_out(args, kwargs):
20
+ args = _zeros_like(args)
21
+ kwargs = _zeros_like(kwargs)
22
+ return args, kwargs
23
+
24
+
25
+ def discard(args, kwargs):
26
+ return (), {}
27
+
28
+
29
+ def _drop_tensors(struct):
30
+ """
31
+ Drop any conditioning that are tensors
32
+ Not using _pytree since we actually want to throw them instead of keeping them.
33
+ """
34
+ if isinstance(struct, dict):
35
+ return {
36
+ k: _drop_tensors(v)
37
+ for k, v in struct.items()
38
+ if not isinstance(v, torch.Tensor)
39
+ }
40
+ elif isinstance(struct, (list, tuple)):
41
+ filtered = [_drop_tensors(x) for x in struct if not isinstance(x, torch.Tensor)]
42
+ return tuple(filtered) if isinstance(struct, tuple) else filtered
43
+ else:
44
+ return struct
45
+
46
+
47
+ def drop_tensors(args, kwargs):
48
+ args = _drop_tensors(args)
49
+ kwargs = _drop_tensors(kwargs)
50
+ return args, kwargs
51
+
52
+
53
+ def add_flag(args, kwargs):
54
+ kwargs["cfg"] = True
55
+ return args, kwargs
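The unconditional-handling helpers above can be compared on a tiny, made-up conditioning set; strength_hint is a hypothetical non-tensor kwarg used only for illustration.

    import torch

    args = (torch.ones(2, 3),)
    kwargs = {"tokens": torch.ones(2, 5), "strength_hint": 0.5}

    zero_out(args, kwargs)      # tensors replaced by zeros_like, non-tensors kept
    discard(args, kwargs)       # ((), {}) -- conditioning removed entirely
    drop_tensors(args, kwargs)  # ((), {"strength_hint": 0.5}) -- only tensors dropped
    add_flag(args, kwargs)      # kwargs gains "cfg": True and is otherwise passed through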
56
+
57
+
58
+ class ClassifierFreeGuidance(torch.nn.Module):
59
+ UNCONDITIONAL_HANDLING_TYPES = {
60
+ "zeros": zero_out,
61
+ "discard": discard,
62
+ "drop_tensors": drop_tensors,
63
+ "add_flag": add_flag,
64
+ }
65
+
66
+ def __init__(
67
+ self,
68
+ backbone, # backbone should be a backbone/generator (e.g. DDPM/DDIM/FlowMatching)
69
+ p_unconditional=0.1,
70
+ strength=3.0,
71
+ # "zeros" = set cond tensors to 0,
72
+ # "discard" = remove cond arguments and let underlying model handle it
73
+ # "drop_tensors" = drop all tensors but leave non-tensors
74
+ # "add_flag" = add an argument in kwargs as "cfg" and defer the handling to generator backbone
75
+ unconditional_handling="zeros",
76
+ interval=None, # only perform cfg if t within interval
77
+ ):
78
+ super().__init__()
79
+
80
+ if not (
81
+ unconditional_handling
82
+ in ClassifierFreeGuidance.UNCONDITIONAL_HANDLING_TYPES
83
+ ):
84
+ raise RuntimeError(
85
+ f"'{unconditional_handling}' is not valid for `unconditional_handling`, should be in {ClassifierFreeGuidance.UNCONDITIONAL_HANDLING_TYPES}"
86
+ )
87
+
88
+ self.backbone = backbone
89
+ self.p_unconditional = p_unconditional
90
+ self.strength = strength
91
+ self.unconditional_handling = unconditional_handling
92
+ self.interval = interval
93
+ self._make_unconditional_args = (
94
+ ClassifierFreeGuidance.UNCONDITIONAL_HANDLING_TYPES[
95
+ self.unconditional_handling
96
+ ]
97
+ )
98
+
99
+ def _cfg_step_tensor(self, y_cond, y_uncond, strength):
100
+ return (1 + strength) * y_cond - strength * y_uncond
101
+
102
+ def _cfg_step(self, y_cond, y_uncond, strength):
103
+ if isinstance(strength, dict):
104
+ return _pytree.tree_map(self._cfg_step_tensor, y_cond, y_uncond, strength)
105
+ else:
106
+ return _pytree.tree_map(partial(self._cfg_step_tensor, strength=strength), y_cond, y_uncond)
107
+
108
+ def inner_forward(self, x, t, is_cond, strength, *args_cond, **kwargs_cond):
109
+ y_cond = self.backbone(x, t, *args_cond, **kwargs_cond)
110
+ if is_cond:
111
+ return y_cond
112
+ else:
113
+ args_cond, kwargs_cond = self._make_unconditional_args(
114
+ args_cond,
115
+ kwargs_cond,
116
+ )
117
+ y_uncond = self.backbone(x, t, *args_cond, **kwargs_cond)
118
+ return self._cfg_step(y_cond, y_uncond, strength)
119
+
120
+ def forward(self, x, t, *args_cond, **kwargs_cond):
121
+ # handle case when no conditional arguments are provided
122
+ if len(args_cond) + len(kwargs_cond) == 0: # unconditional
123
+ if self.unconditional_handling != "discard":
124
+ raise RuntimeError(
125
+ f"cannot call `ClassifierFreeGuidance` module without condition"
126
+ )
127
+ return self.backbone(x, t)
128
+ else: # conditional arguments are provided
129
+ # training mode
130
+ if self.training:
131
+ coin_flip = random.random() < self.p_unconditional
132
+ if coin_flip: # unconditional
133
+ args_cond, kwargs_cond = self._make_unconditional_args(
134
+ args_cond,
135
+ kwargs_cond,
136
+ )
137
+ return self.backbone(x, t, *args_cond, **kwargs_cond)
138
+ else: # inference mode
139
+ strength = get_strength(self.strength, self.interval, t)
140
+ is_cond = not any(x > 0.0 for x in _pytree.tree_flatten(strength)[0])
141
+ return self.inner_forward(
142
+ x, t, is_cond, strength, *args_cond, **kwargs_cond
143
+ )
144
+
145
+ def get_strength(strength, interval, t):
146
+ if interval is None:
147
+ return strength # interval=None means no restriction on when guidance is applied
148
+
149
+ # If interval is not a dict (single tuple), broadcast it
150
+ if not isinstance(interval, dict):
151
+ return _pytree.tree_map(
152
+ lambda x: x if interval[0] <= t <= interval[1] else 0.0,
153
+ strength
154
+ )
155
+
156
+ return _pytree.tree_map(
157
+ lambda x, iv: x if iv[0] <= t <= iv[1] else 0.0,
158
+ strength,
159
+ interval
160
+ )
161
+
162
+ class PointmapCFG(ClassifierFreeGuidance):
163
+
164
+ def __init__(self, *args, strength_pm=0.0, **kwargs):
165
+ super().__init__(*args, **kwargs)
166
+ self.strength_pm = strength_pm
167
+
168
+ def _cfg_step_tensor(self, y_cond, y_uncond, y_unpm, strength, strength_pm):
169
+ # https://arxiv.org/abs/2411.18613
170
+ return y_cond \
171
+ + strength_pm * (y_cond - y_unpm) \
172
+ + strength * (y_unpm - y_uncond)
173
+
174
+ def _cfg_step(self, y_cond, y_uncond, y_pm, strength, strength_pm):
175
+ if isinstance(strength, dict):
176
+ return _pytree.tree_map(self._cfg_step_tensor, y_cond, y_uncond, y_pm, strength, strength_pm)
177
+ else:
178
+ return _pytree.tree_map(partial(self._cfg_step_tensor, strength=strength, strength_pm=strength_pm), y_cond, y_uncond, y_pm)
179
+
180
+ def inner_forward(self, x, t, is_cond, strength, strength_pm, *args_cond, **kwargs_cond):
181
+ y_cond = self.backbone(x, t, *args_cond, **kwargs_cond)
182
+
183
+ if is_cond:
184
+ return y_cond
185
+ else:
186
+ force_drop_modalities = self.backbone.condition_embedder.force_drop_modalities
187
+ self.backbone.condition_embedder.force_drop_modalities = ['pointmap', 'rgb_pointmap']
188
+ y_pm = self.backbone(x, t, *args_cond, **kwargs_cond)
189
+ self.backbone.condition_embedder.force_drop_modalities = force_drop_modalities
190
+
191
+ args_cond, kwargs_cond = self._make_unconditional_args(
192
+ args_cond,
193
+ kwargs_cond,
194
+ )
195
+ y_uncond = self.backbone(x, t, *args_cond, **kwargs_cond)
196
+ return self._cfg_step(y_cond, y_uncond, y_pm, strength, strength_pm)
197
+
198
+ def forward(self, x, t, *args_cond, **kwargs_cond):
199
+ # handle case when no conditional arguments are provided
200
+ if len(args_cond) + len(kwargs_cond) == 0: # unconditional
201
+ if self.unconditional_handling != "discard":
202
+ raise RuntimeError(
203
+ f"cannot call `ClassifierFreeGuidance` module without condition"
204
+ )
205
+ return self.backbone(x, t)
206
+ else: # conditional arguments are provided
207
+ # training mode
208
+ if self.training:
209
+ coin_flip = random.random() < self.p_unconditional
210
+ if coin_flip: # unconditional
211
+ args_cond, kwargs_cond = self._make_unconditional_args(
212
+ args_cond,
213
+ kwargs_cond,
214
+ )
215
+ return self.backbone(x, t, *args_cond, **kwargs_cond)
216
+ else: # inference mode
217
+ strength = get_strength(self.strength, self.interval, t)
218
+ is_cond = not any(x > 0.0 for x in _pytree.tree_flatten(strength)[0])
219
+ strength_pm = get_strength(self.strength_pm, self.interval, t)
220
+ return self.inner_forward(
221
+ x, t, is_cond, strength, strength_pm, *args_cond, **kwargs_cond
222
+ )
223
+
224
+ class ClassifierFreeGuidanceWithExternalUnconditionalProbability(ClassifierFreeGuidance):
225
+
226
+ def __init__(self, *args, use_unconditional_from_flow_matching=False, **kwargs):
227
+ super().__init__(*args, **kwargs)
228
+ self.use_unconditional_from_flow_matching = use_unconditional_from_flow_matching
229
+
230
+ def forward(self, x, t, *args_cond, p_unconditional=None, **kwargs_cond):
231
+ # p_unconditional should be a value in [0, 1], indicating the probability of unconditional
232
+
233
+ if p_unconditional is None:
234
+ coin_flip = random.random() < self.p_unconditional
235
+ else:
236
+ coin_flip = random.random() < p_unconditional
237
+
238
+ # handle case when no conditional arguments are provided
239
+ if len(args_cond) + len(kwargs_cond) == 0: # unconditional
240
+ if self.unconditional_handling != "discard":
241
+ raise RuntimeError(
242
+ f"cannot call `ClassifierFreeGuidance` module without condition"
243
+ )
244
+ return self.backbone(x, t)
245
+ else: # conditional arguments are provided
246
+ # training mode
247
+ if self.training:
248
+ if coin_flip: # unconditional
249
+ args_cond, kwargs_cond = self._make_unconditional_args(
250
+ args_cond,
251
+ kwargs_cond,
252
+ )
253
+ return self.backbone(x, t, *args_cond, **kwargs_cond)
254
+ else: # inference mode
255
+ strength = get_strength(self.strength, self.interval, t)
256
+ is_cond = not any(x > 0.0 for x in _pytree.tree_flatten(strength)[0])
257
+ return self.inner_forward(
258
+ x, t, is_cond, strength, *args_cond, **kwargs_cond
259
+ )
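A minimal usage sketch of the `ClassifierFreeGuidance` wrapper added above, assuming a toy conditional network and illustrative hyperparameters (the `ToyBackbone`, the shapes, and the `interval` value are not part of this diff; the `"zeros"` handling is assumed to zero out tensor conditions as in the helpers earlier in this file):

    import torch

    # Hypothetical stand-in for the real backbone: predicts an output from (x, t, cond).
    class ToyBackbone(torch.nn.Module):
        def __init__(self, dim=8):
            super().__init__()
            self.net = torch.nn.Linear(dim * 2 + 1, dim)

        def forward(self, x, t, cond=None):
            if cond is None:  # covers the "discard" handling, which removes the condition
                cond = torch.zeros_like(x)
            t = t.expand(x.shape[0]).unsqueeze(-1).float()
            return self.net(torch.cat([x, cond, t], dim=-1))

    # ClassifierFreeGuidance is the class defined in the diff above.
    cfg = ClassifierFreeGuidance(
        ToyBackbone(),
        p_unconditional=0.1,           # probability of dropping the condition during training
        strength=3.0,                  # guidance strength at inference
        unconditional_handling="zeros",
        interval=(0.0, 1000.0),        # apply guidance at every (scaled) timestep
    )

    x, cond = torch.randn(4, 8), torch.randn(4, 8)
    t = torch.tensor(500.0)

    cfg.train()
    _ = cfg(x, t, cond)                # condition is randomly dropped with p_unconditional

    cfg.eval()
    with torch.no_grad():
        y = cfg(x, t, cond)            # (1 + strength) * y_cond - strength * y_uncond

The `PointmapCFG` variant below splits guidance into two terms, `strength_pm * (y_cond - y_pm)` for the pointmap condition and `strength * (y_pm - y_uncond)` for the remaining conditions, following arXiv:2411.18613.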
thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/generator/flow_matching/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/generator/flow_matching/model.py ADDED
@@ -0,0 +1,363 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ from typing import Callable, Sequence, Union
3
+ import torch
4
+ import numpy as np
5
+ from functools import partial
6
+ import optree
7
+ import math
8
+
9
+ from sam3d_objects.model.backbone.generator.base import Base
10
+ from sam3d_objects.data.utils import right_broadcasting
11
+ from sam3d_objects.data.utils import tree_tensor_map, tree_reduce_unique
12
+ from sam3d_objects.model.backbone.generator.flow_matching.solver import (
13
+ ODESolver,
14
+ Euler,
15
+ Midpoint,
16
+ RungeKutta4,
17
+ gradient,
18
+ SDE,
19
+ )
20
+
21
+ # default sampler in flow matching
22
+ uniform_sampler = torch.rand
23
+
24
+
25
+ # https://arxiv.org/pdf/2403.03206
26
+ def lognorm_sampler(mean=0.0, std=1.0, **kwargs):
27
+ logit = torch.randn(**kwargs) * std + mean
28
+ return torch.nn.functional.sigmoid(logit)
29
+
30
+
31
+ # for backwards compatibility; please do not use this
32
+ def rev_lognorm_sampler(mean=0.0, std=1.0, **kwargs):
33
+ logit = torch.randn(**kwargs) * std + mean
34
+ return 1 - torch.nn.functional.sigmoid(logit)
35
+
36
+
37
+ # https://arxiv.org/pdf/2210.02747
38
+ class FlowMatching(Base):
39
+ SOLVER_METHODS = {
40
+ "euler": Euler,
41
+ "midpoint": Midpoint,
42
+ "rk4": RungeKutta4,
43
+ "sde": SDE,
44
+ }
45
+
46
+ def __init__(
47
+ self,
48
+ reverse_fn: Callable,
49
+ sigma_min: float = 0.0, # 0.0 = rectified flow
50
+ inference_steps: int = 100,
51
+ time_scale: float = 1000.0, # scale [0,1]-time before passing to `reverse_fn`
52
+ training_time_sampler_fn: Callable = partial(
53
+ lognorm_sampler,
54
+ mean=0,
55
+ std=1,
56
+ ),
57
+ reversed_timestamp=False,
58
+ rescale_t=1.0,
59
+ loss_fn=partial(torch.nn.functional.mse_loss, reduction="mean"),
60
+ loss_weights=1.0,
61
+ solver_method: Union[str, ODESolver] = "euler",
62
+ solver_kwargs: dict = {},
63
+ **kwargs,
64
+ ):
65
+ super().__init__(**kwargs)
66
+
67
+ self.reverse_fn = reverse_fn
68
+ self.sigma_min = sigma_min
69
+ self.inference_steps = inference_steps
70
+ self.time_scale = time_scale
71
+ self.training_time_sampler_fn = training_time_sampler_fn
72
+ self.reversed_timestamp = reversed_timestamp
73
+ self.rescale_t = rescale_t
74
+ self.loss_fn = loss_fn
75
+ self.loss_weights = loss_weights
76
+ self._solver_method, self._solver = self._get_solver(
77
+ solver_method, solver_kwargs
78
+ )
79
+
80
+ def _get_solver(self, solver_method, solver_kwargs):
81
+ if solver_method in FlowMatching.SOLVER_METHODS:
82
+ solver = FlowMatching.SOLVER_METHODS[solver_method](**solver_kwargs)
83
+ elif isinstance(solver_method, ODESolver):
84
+ solver_method = f"custom[{solver_method.__class__.__name__}]"
85
+ solver = solver_method
86
+ else:
87
+ raise ValueError(
88
+ f"Invalid solver `{solver_method}`, should be in {set(self.SOLVER_METHODS.keys())} or an ODESolver instance"
89
+ )
90
+ return solver_method, solver
91
+
92
+ def _generate_noise_tensor(self, x_shape, x_device):
93
+ return torch.randn(
94
+ x_shape,
95
+ # generator=self.random_generator,
96
+ device=x_device,
97
+ )
98
+
99
+ def _generate_noise(self, x_shape, x_device):
100
+ def is_shape(maybe_shape):
101
+ return isinstance(maybe_shape, Sequence) and all(
102
+ (isinstance(s, int) and s >= 0) for s in maybe_shape
103
+ )
104
+
105
+ return optree.tree_map(
106
+ partial(self._generate_noise_tensor, x_device=x_device),
107
+ x_shape,
108
+ is_leaf=is_shape,
109
+ none_is_leaf=False,
110
+ )
111
+
112
+ def _generate_x0_tensor(self, x1: torch.Tensor):
113
+ x0 = self._generate_noise_tensor(x1.shape, x1.device)
114
+ return x0
115
+
116
+ def _generate_xt_tensor(self, x0: torch.Tensor, x1: torch.Tensor, t: torch.Tensor):
117
+ # equation (22)
118
+ tb = right_broadcasting(t.to(x1.device), x1)
119
+ x_t = (1 - (1 - self.sigma_min) * tb) * x0 + tb * x1
120
+
121
+ return x_t
122
+
123
+ def _generate_target_tensor(self, x0: torch.Tensor, x1: torch.Tensor):
124
+ # equation (23)
125
+ target = x1 - (1 - self.sigma_min) * x0
126
+
127
+ return target
128
+
129
+ def _generate_x0(self, x1):
130
+ return tree_tensor_map(self._generate_x0_tensor, x1)
131
+
132
+ def _generate_xt(self, x0, x1, t):
133
+ return tree_tensor_map(
134
+ partial(self._generate_xt_tensor, t=t),
135
+ x0,
136
+ x1,
137
+ )
138
+
139
+ def _generate_target(self, x0, x1):
140
+ return tree_tensor_map(
141
+ self._generate_target_tensor,
142
+ x0,
143
+ x1,
144
+ )
145
+
146
+ def _generate_t(self, x1):
147
+ first_tensor = optree.tree_flatten(x1)[0][0]
148
+ batch_size = first_tensor.shape[0]
149
+ device = first_tensor.device
150
+
151
+ t = self.training_time_sampler_fn(
152
+ size=(batch_size,),
153
+ generator=self.random_generator,
154
+ ).to(device)
155
+
156
+ return t
157
+
158
+ def loss(self, x1: torch.Tensor, *args_conditionals, **kwargs_conditionals):
159
+ t = self._generate_t(x1)
160
+ x0 = self._generate_x0(x1)
161
+ x_t = self._generate_xt(x0, x1, t)
162
+ target = self._generate_target(x0, x1)
163
+
164
+ prediction = self.reverse_fn(
165
+ x_t,
166
+ t * self.time_scale,
167
+ *args_conditionals,
168
+ **kwargs_conditionals,
169
+ )
170
+
171
+ # broadcast loss_fn and loss_weights over the prediction tree and compute the weighted loss
172
+ loss = optree.tree_broadcast_map(
173
+ lambda fn, weight, pred, targ: weight * fn(pred, targ),
174
+ self.loss_fn,
175
+ self.loss_weights,
176
+ prediction,
177
+ target,
178
+ )
179
+
180
+ total_loss = sum(optree.tree_flatten(loss)[0])
181
+
182
+ # Create detailed loss breakdown
183
+ detail_losses = {
184
+ "flow_matching_loss": total_loss,
185
+ }
186
+ if isinstance(loss, dict):
187
+ detail_losses.update(loss)
188
+ return total_loss, detail_losses
189
+
190
+ def _prepare_t(self, steps=None):
191
+ steps = self.inference_steps if steps is None else steps
192
+ t_seq = torch.linspace(0, 1, steps + 1)
193
+
194
+ if self.rescale_t:
195
+ t_seq = t_seq / (1 + (self.rescale_t - 1) * (1 - t_seq))
196
+
197
+ if self.reversed_timestamp:
198
+ t_seq = 1 - t_seq
199
+
200
+ return t_seq
201
+
202
+ def generate_iter(
203
+ self,
204
+ x_shape,
205
+ x_device,
206
+ *args_conditionals,
207
+ **kwargs_conditionals,
208
+ ):
209
+ x_0 = self._generate_noise(x_shape, x_device)
210
+ t_seq = self._prepare_t().to(x_device)
211
+
212
+ for x_t, t in self._solver.solve_iter(
213
+ self._generate_dynamics,
214
+ x_0,
215
+ t_seq,
216
+ *args_conditionals,
217
+ **kwargs_conditionals,
218
+ ):
219
+ yield t, x_t, ()
220
+
221
+ def _generate_dynamics(
222
+ self,
223
+ x_t,
224
+ t,
225
+ *args_conditionals,
226
+ **kwargs_conditionals,
227
+ ):
228
+ return self.reverse_fn(x_t, t * self.time_scale, *args_conditionals, **kwargs_conditionals)
229
+
230
+ def _log_p0(self, x0):
231
+ x0 = self._tree_flatten(x0)
232
+ inside_exp = -(x0**2).sum(dim=1) / 2
233
+ return inside_exp - math.log(2 * math.pi) / 2 * x0.shape[1]
234
+
235
+ def log_likelihood(
236
+ self,
237
+ x1,
238
+ solver=None,
239
+ steps=None,
240
+ z_samples=1,
241
+ *args_conditionals,
242
+ **kwargs_conditionals,
243
+ ):
244
+ device = tree_reduce_unique(lambda tensor: tensor.device, x1)
245
+ # device = "cuda"
246
+ t_seq = self._prepare_t(steps).to(device)
247
+ t_seq = 1 - t_seq # from x1 to x0
248
+ solver = self._solver if solver is None else self._get_solver(solver)[1]
249
+
250
+ x_0 = solver.solve(
251
+ partial(self._log_likelihood_dynamics, device=device, z_samples=z_samples),
252
+ {"x": x1, "log_p": 0.0},
253
+ t_seq,
254
+ *args_conditionals,
255
+ **kwargs_conditionals,
256
+ )
257
+
258
+ log_p1 = x_0["log_p"] + self._log_p0(x_0["x"])
259
+
260
+ return log_p1
261
+
262
+ def _log_likelihood_dynamics(
263
+ self,
264
+ state,
265
+ t,
266
+ device,
267
+ z_samples,
268
+ *args_conditionals,
269
+ **kwargs_conditionals,
270
+ ):
271
+ t = torch.tensor([t * self.time_scale], device=device, dtype=torch.float32)
272
+ x_t = state["x"]
273
+
274
+ with torch.set_grad_enabled(True):
275
+ tree_tensor_map(lambda x: x.requires_grad_(True), x_t)
276
+ velocity = self.reverse_fn(
277
+ x_t,
278
+ t,
279
+ *args_conditionals,
280
+ **kwargs_conditionals,
281
+ )
282
+
283
+ # compute the divergence estimate
284
+ div = self._compute_hutchinson_divergence(velocity, x_t, z_samples)
285
+
286
+ tree_tensor_map(lambda x: x.requires_grad_(False), x_t)
287
+ velocity = tree_tensor_map(lambda x: x.detach(), velocity)
288
+
289
+ return {"x": velocity, "log_p": div.detach()}
290
+
291
+ def _tree_flatten(self, tree):
292
+ flat_x = tree_tensor_map(lambda x: x.flatten(start_dim=1), tree)
293
+ flat_x, _ = optree.tree_flatten(
294
+ flat_x,
295
+ is_leaf=lambda x: isinstance(x, torch.Tensor),
296
+ )
297
+ flat_x = torch.cat(flat_x, dim=1)
298
+ return flat_x
299
+
300
+ def _compute_hutchinson_divergence(self, velocity, x_t, z_samples):
301
+ flat_velocity = self._tree_flatten(velocity)
302
+ flat_velocity = flat_velocity.unsqueeze(-1)
303
+
304
+ z = torch.randn(
305
+ flat_velocity.shape[:-1] + (z_samples,),
306
+ dtype=flat_velocity.dtype,
307
+ device=flat_velocity.device,
308
+ )
309
+ z = z < 0
310
+ z = z * 2.0 - 1.0
311
+ z = z / math.sqrt(z_samples)
312
+
313
+ # compute the Hutchinson divergence estimator E[z^T D_x(vt) z] = E[D_x(z^T vt) z]
314
+ vt_dot_z = torch.einsum("ijk,ijk->ik", flat_velocity, z)
315
+ grad_vt_dot_z = [
316
+ gradient(vt_dot_z[..., i], x_t, create_graph=(z_samples > 1))
317
+ for i in range(z_samples)
318
+ ]
319
+ grad_vt_dot_z = [self._tree_flatten(g) for g in grad_vt_dot_z]
320
+ grad_vt_dot_z = torch.stack(grad_vt_dot_z, dim=-1)
321
+ div = torch.einsum("ijk,ijk->i", grad_vt_dot_z, z)
322
+ return div
323
+
324
+
325
+ def _get_device(x):
326
+ device = tree_reduce_unique(lambda tensor: tensor.device, x)
327
+ return device
328
+
329
+
330
+ class ConditionalFlowMatching(FlowMatching):
331
+ def generate_iter(
332
+ self,
333
+ x_shape,
334
+ x_device,
335
+ *args_conditionals,
336
+ **kwargs_conditionals,
337
+ ):
338
+ x_0 = self._generate_noise(x_shape, x_device)
339
+ t_seq = self._prepare_t().to(x_device)
340
+
341
+ noise_override = None
342
+ if "noise_override" in kwargs_conditionals:
343
+ noise_override = kwargs_conditionals["noise_override"]
344
+ del kwargs_conditionals["noise_override"]
345
+ if noise_override is not None:
346
+ if isinstance(x_0, dict):
347
+ x_0.update(noise_override)
348
+ else:
349
+ x_0 = noise_override
350
+
351
+ for x_t, t in self._solver.solve_iter(
352
+ self._generate_dynamics,
353
+ x_0,
354
+ t_seq,
355
+ *args_conditionals,
356
+ **kwargs_conditionals,
357
+ ):
358
+ if noise_override is not None:
359
+ if isinstance(noise_override, dict):
360
+ x_t.update(noise_override)
361
+ else:
362
+ x_t = noise_override
363
+ yield t, x_t, ()
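For context, a self-contained sketch of the objective and Euler sampling loop that `FlowMatching` implements (equations (22)-(23) referenced in the comments above, with `sigma_min = 0`, i.e. rectified flow); the toy 1-D velocity network and data distribution are illustrative assumptions, not part of this diff:

    import torch

    # Toy stand-in for `reverse_fn`: a velocity network v(x_t, t) on 1-D data.
    net = torch.nn.Sequential(
        torch.nn.Linear(2, 64), torch.nn.SiLU(), torch.nn.Linear(64, 1)
    )
    opt = torch.optim.Adam(net.parameters(), lr=1e-3)

    def velocity(x_t, t):
        return net(torch.cat([x_t, t[:, None]], dim=-1))

    # One training step: x_t = (1 - t) * x0 + t * x1, target = x1 - x0 (sigma_min = 0).
    x1 = torch.randn(256, 1) * 0.1 + 2.0      # "data": N(2, 0.1^2)
    x0 = torch.randn_like(x1)                 # noise
    t = torch.rand(x1.shape[0])               # uniform time sampler for simplicity
    x_t = (1 - t[:, None]) * x0 + t[:, None] * x1
    loss = torch.nn.functional.mse_loss(velocity(x_t, t), x1 - x0)
    opt.zero_grad()
    loss.backward()
    opt.step()

    # Euler sampling: integrate dx/dt = v(x, t) from t = 0 (noise) to t = 1 (data).
    x = torch.randn(16, 1)
    t_seq = torch.linspace(0, 1, 101)         # 100 steps, as in `_prepare_t`
    with torch.no_grad():
        for t0, t1 in zip(t_seq[:-1], t_seq[1:]):
            x = x + (t1 - t0) * velocity(x, t0.expand(x.shape[0]))

`FlowMatching` additionally rescales time by `time_scale` before calling `reverse_fn` and supports a log-normal time sampler, pytree-structured states, and higher-order solvers, but the update rule is the same.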
thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/generator/flow_matching/solver.py ADDED
@@ -0,0 +1,126 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import optree
3
+ import torch
4
+ from functools import partial
5
+
6
+ from sam3d_objects.data.utils import tree_tensor_map
7
+
8
+
9
+ def linear_approximation_step(x_t, dt, velocity):
10
+ # x_tp1 = x_t + velocity * dt
11
+ x_tp1 = tree_tensor_map(lambda x, v: x + v * dt, x_t, velocity)
12
+ return x_tp1
13
+
14
+
15
+ def gradient(output, x, create_graph: bool = False):
16
+ tensors, pyspec = optree.tree_flatten(
17
+ x, is_leaf=lambda x: isinstance(x, torch.Tensor)
18
+ )
19
+ grad_outputs = [torch.ones_like(output).detach() for _ in tensors]
20
+ grads = torch.autograd.grad(
21
+ output,
22
+ tensors,
23
+ grad_outputs=grad_outputs,
24
+ create_graph=create_graph,
25
+ )
26
+ return optree.tree_unflatten(pyspec, grads)
27
+
28
+
29
+ class ODESolver:
30
+ def step(self, dynamics_fn, x_t, t, dt, *args, **kwargs):
31
+ raise NotImplementedError
32
+
33
+ def solve_iter(self, dynamics_fn, x_init, times, *args, **kwargs):
34
+ x_t = x_init
35
+ for t0, t1 in zip(times[:-1], times[1:]):
36
+ dt = t1 - t0
37
+ x_t = self.step(dynamics_fn, x_t, t0, dt, *args, **kwargs)
38
+ yield x_t, t0
39
+
40
+ def solve(self, dynamics_fn, x_init, times, *args, **kwargs):
41
+ for x_t, _ in self.solve_iter(dynamics_fn, x_init, times, *args, **kwargs):
42
+ pass
43
+ return x_t
44
+
45
+
46
+ # https://en.wikipedia.org/wiki/Euler_method
47
+ class Euler(ODESolver):
48
+ def step(self, dynamics_fn, x_t, t, dt, *args, **kwargs):
49
+ velocity = dynamics_fn(x_t, t, *args, **kwargs)
50
+ x_tp1 = linear_approximation_step(x_t, dt, velocity)
51
+ return x_tp1
52
+
53
+
54
+ # https://arxiv.org/abs/2505.05470
55
+ class SDE(ODESolver):
56
+ def __init__(self, **kwargs):
57
+ super().__init__()
58
+ self.sde_strength = kwargs.get("sde_strength", 0.1)
59
+
60
+ def step(self, dynamics_fn, x_t, t, dt, *args, **kwargs):
61
+ velocity = dynamics_fn(x_t, t, *args, **kwargs)
62
+ sigma = 1 - t
63
+ var_t = sigma / (1 - torch.tensor(sigma).clamp(min=dt))
64
+ std_dev_t = (
65
+ torch.sqrt(variance) * self.sde_strength
66
+ ) # self.sde_strength = alpha
67
+
68
+ def compute_mean(x, v):
69
+ drift_term = x * (std_dev_t**2 / (2 * sigma) * dt)
70
+ velocity_term = v * (1 + std_dev_t**2 * (1 - sigma) / (2 * sigma)) * dt
71
+ return x + drift_term + velocity_term
72
+
73
+ prev_sample_mean = tree_tensor_map(compute_mean, x_t, velocity)
74
+
75
+ # Generate noise and compute final sample using tree_tensor_map
76
+ def add_noise(mean_val):
77
+ variance_noise = torch.randn_like(mean_val)
78
+ return mean_val + std_dev_t * torch.sqrt(torch.tensor(dt)) * variance_noise
79
+
80
+ prev_sample = tree_tensor_map(add_noise, prev_sample_mean)
81
+
82
+ return prev_sample
83
+
84
+
85
+ # https://en.wikipedia.org/wiki/Midpoint_method
86
+ class Midpoint(ODESolver):
87
+ def step(self, dynamics_fn, x_t, t, dt, *args, **kwargs):
88
+ half_dt = 0.5 * dt
89
+
90
+ x_mid = Euler.step(self, dynamics_fn, x_t, t, half_dt, *args, **kwargs)
91
+
92
+ velocity_mid = dynamics_fn(x_mid, t + half_dt, *args, **kwargs)
93
+ x_tp1 = linear_approximation_step(x_t, dt, velocity_mid)
94
+ return x_tp1
95
+
96
+
97
+ # https://en.wikipedia.org/wiki/Runge%E2%80%93Kutta_methods
98
+ class RungeKutta4(ODESolver):
99
+
100
+ def k1(self, dynamics_fn, x_t, t, dt, *args, **kwargs):
101
+ return dynamics_fn(x_t, t, *args, **kwargs)
102
+
103
+ def k2(self, dynamics_fn, x_t, t, dt, k1, *args, **kwargs):
104
+ x_k1 = linear_approximation_step(x_t, dt * 0.5, k1)
105
+ return dynamics_fn(x_k1, t + dt * 0.5, *args, **kwargs)
106
+
107
+ def k3(self, dynamics_fn, x_t, t, dt, k2, *args, **kwargs):
108
+ x_k2 = linear_approximation_step(x_t, dt * 0.5, k2)
109
+ return dynamics_fn(x_k2, t + dt * 0.5, *args, **kwargs)
110
+
111
+ def k4(self, dynamics_fn, x_t, t, dt, k3, *args, **kwargs):
112
+ x_k3 = linear_approximation_step(x_t, dt, k3)
113
+ return dynamics_fn(x_k3, t + dt, *args, **kwargs)
114
+
115
+ def step(self, dynamics_fn, x_t, t, dt, *args, **kwargs):
116
+ k1 = self.k1(dynamics_fn, x_t, t, dt, *args, **kwargs)
117
+ k2 = self.k2(dynamics_fn, x_t, t, dt, k1, *args, **kwargs)
118
+ k3 = self.k3(dynamics_fn, x_t, t, dt, k2, *args, **kwargs)
119
+ k4 = self.k4(dynamics_fn, x_t, t, dt, k3, *args, **kwargs)
120
+
121
+ def compute_velocity(k1, k2, k3, k4):
122
+ return (k1 + 2 * k2 + 2 * k3 + k4) / 6
123
+
124
+ velocity_k = tree_tensor_map(compute_velocity, k1, k2, k3, k4)
125
+ x_tp1 = linear_approximation_step(x_t, dt, velocity_k)
126
+ return x_tp1
thirdparty/sam3d/sam3d/sam3d_objects/model/backbone/generator/shortcut/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.