| |
| """ |
| Check the original Dolci dataset format to understand its structure. |
| """ |
|
|
| from datasets import load_dataset |
| import json |
|
|
def _preview(value, limit):
    """Return the first *limit* items of *value*, adding '...' if it was cut.

    Only slicing and ``len`` are used, so any sliceable value works; the
    dataset fields passed in are presumably strings -- TODO confirm.
    """
    return f"{value[:limit]}{'...' if len(value) > limit else ''}"


def main(tail_size=20000, offsets=(0, 100, 500), preview_chars=200):
    """Print a few samples from the tail of the Dolci tool-use SFT dataset.

    Loads the ``train`` split of ``allenai/Dolci-Instruct-SFT-Tool-Use`` and,
    for every selected row, prints the number of messages followed by each
    message's role, a truncated preview of its content and function calls,
    and the length of its ``functions`` field when present.

    Args:
        tail_size: Distance from the end of the split at which the first
            sample is taken.
        offsets: Offsets added to that starting row to choose the samples.
        preview_chars: Maximum length of the content / function-call preview
            before '...' is appended.
    """
    print("Loading dataset...")
    dataset = load_dataset("allenai/Dolci-Instruct-SFT-Tool-Use", split="train")

    total = len(dataset)
    # Clamp to 0 so a split shorter than tail_size does not yield a negative
    # start index (which would index from the end or raise IndexError).
    start_idx = max(0, total - tail_size)
    # Keep only rows that actually exist in the split.
    sample_indices = [
        start_idx + offset
        for offset in offsets
        if 0 <= start_idx + offset < total
    ]
    if not sample_indices:
        print(f"No samples to show (dataset has {total} rows).")
        return

    rule = "=" * 60
    for idx in sample_indices:
        print(f"\n{rule}")
        print(f"Sample {idx}:")
        print(rule)

        sample = dataset[idx]
        messages = sample['messages']

        print(f"\nTotal messages: {len(messages)}")

        for i, msg in enumerate(messages):
            role = msg.get('role', '')
            content = msg.get('content', '')
            function_calls = msg.get('function_calls')
            functions = msg.get('functions')

            print(f"\n--- Message {i} ---")
            print(f"Role: {role}")
            # Truthiness checks skip both missing keys and None/empty values.
            if content:
                print(f"Content: {_preview(content, preview_chars)}")
            if function_calls:
                print(f"Function calls: {_preview(function_calls, preview_chars)}")
            if functions:
                print(f"Has functions: True (length: {len(functions)})")
|
|
# Script entry point: run the inspection only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|