# pytorch
# 127 lines · 4.3 KB
# Owner(s): ["oncall: distributed"]

import sys

import torch

from torch.distributed._shard.sharded_tensor import (
    Shard,
    ShardedTensor,
    ShardedTensorMetadata,
    ShardMetadata,
)
from torch.distributed._shard.sharded_tensor.metadata import TensorProperties
from torch.distributed.checkpoint.metadata import MetadataIndex
from torch.distributed.checkpoint.utils import find_state_dict_object

from torch.testing._internal.common_utils import (
    run_tests,
    TEST_WITH_DEV_DBG_ASAN,
    TestCase,
)
from torch.testing._internal.distributed.distributed_utils import with_fake_comms
23
# Under dev/debug ASAN builds, print a notice to stderr and exit with
# status 0 before any test in this module is defined or run.
if TEST_WITH_DEV_DBG_ASAN:
    print(
        "Skip dev-asan as torch + multiprocessing spawn have known issues",
        file=sys.stderr,
    )
    sys.exit(0)
30
31
def create_sharded_tensor(rank, world_size, shards_per_rank, shard_size=8):
    """Build a 1-D ``ShardedTensor`` containing only the shards owned by ``rank``.

    The global tensor is laid out as ``world_size * shards_per_rank``
    consecutive shards of ``shard_size`` elements each. Shard ``idx`` starts at
    offset ``idx * shard_size`` and is placed on CPU of rank
    ``idx // shards_per_rank``, so each rank owns ``shards_per_rank``
    neighbouring shards.

    Args:
        rank: Rank whose shards are materialized locally (filled with
            ``torch.rand`` values).
        world_size: Number of ranks described by the global metadata.
        shards_per_rank: Number of shards assigned to every rank.
        shard_size: Length of each shard; defaults to 8, the value that was
            previously hard-coded.

    Returns:
        The ``ShardedTensor`` produced by
        ``ShardedTensor._init_from_local_shards_and_global_metadata``.

    NOTE(review): the caller in this file runs under ``with_fake_comms``;
    presumably an initialized process group is required -- confirm.
    """
    shards_metadata = []
    local_shards = []
    for idx in range(world_size * shards_per_rank):
        shard_rank = idx // shards_per_rank
        shard_md = ShardMetadata(
            shard_offsets=[idx * shard_size],
            shard_sizes=[shard_size],
            placement=f"rank:{shard_rank}/cpu",
        )
        # Metadata is recorded for every shard, local or not.
        shards_metadata.append(shard_md)
        if shard_rank == rank:
            # Only shards placed on this rank get actual tensor data.
            local_shards.append(
                Shard.from_tensor_and_offsets(
                    torch.rand(*shard_md.shard_sizes),
                    shard_offsets=shard_md.shard_offsets,
                    rank=rank,
                )
            )

    sharded_tensor_md = ShardedTensorMetadata(
        shards_metadata=shards_metadata,
        size=torch.Size([shard_size * len(shards_metadata)]),
        # Properties are taken from a default-dtype CPU tensor, the same kind
        # of tensor torch.rand produces for the local shards above.
        tensor_properties=TensorProperties.create_from_tensor(torch.zeros(1)),
    )

    return ShardedTensor._init_from_local_shards_and_global_metadata(
        local_shards=local_shards, sharded_tensor_metadata=sharded_tensor_md
    )
58
59
class TestMedatadaIndex(TestCase):
    """Tests for ``MetadataIndex`` and ``find_state_dict_object``.

    NOTE: the class name spelling is kept unchanged because it is the
    identifier test runners select on.
    """

    def test_init_convert_offset(self):
        """An offset given as a list equals the same offset given as torch.Size."""
        a = MetadataIndex("foo", [1, 2])
        b = MetadataIndex("foo", torch.Size([1, 2]))
        self.assertEqual(a, b)

    def test_index_hint_ignored_on_equals(self):
        """The ``index`` hint does not take part in equality."""
        a = MetadataIndex("foo")
        b = MetadataIndex("foo", index=99)
        self.assertEqual(a, b)

    def test_index_hint_ignored_on_hash(self):
        """The ``index`` hint does not take part in hashing."""
        a = MetadataIndex("foo")
        b = MetadataIndex("foo", index=99)
        self.assertEqual(hash(a), hash(b))

    def test_flat_data(self):
        """Lookups on plain (non-sharded) entries return the stored object."""
        state_dict = {
            "a": torch.rand(10),
            "b": [1, 2, 3],
        }

        # A regular tensor is returned whole, with or without offset/hint.
        a = find_state_dict_object(state_dict, MetadataIndex("a"))
        self.assertEqual(a, state_dict["a"])
        a = find_state_dict_object(state_dict, MetadataIndex("a", [0]))
        self.assertEqual(a, state_dict["a"])
        a = find_state_dict_object(state_dict, MetadataIndex("a", index=99))
        self.assertEqual(a, state_dict["a"])

        # A non-tensor value is returned whole as well; the hint is ignored.
        b = find_state_dict_object(state_dict, MetadataIndex("b"))
        self.assertEqual(b, state_dict["b"])
        b = find_state_dict_object(state_dict, MetadataIndex("b", index=1))
        self.assertEqual(b, state_dict["b"])

        # Unknown key -> error mentioning the FQN.
        with self.assertRaisesRegex(ValueError, "FQN"):
            find_state_dict_object(state_dict, MetadataIndex("c"))
        # Offset supplied for a non-tensor value -> error mentioning ShardedTensor.
        with self.assertRaisesRegex(ValueError, "ShardedTensor"):
            find_state_dict_object(state_dict, MetadataIndex("b", [1]))

    @with_fake_comms(rank=0, world_size=2)
    def test_sharded_tensor_lookup(self):
        """Shard lookup by offset works regardless of the quality of the hint."""
        # Rank 0 owns three shards of length 8, at offsets 0, 8 and 16, so
        # offset [8] corresponds to local_shards()[1].
        st = create_sharded_tensor(rank=0, world_size=2, shards_per_rank=3)
        state_dict = {"st": st}

        obj = find_state_dict_object(state_dict, MetadataIndex("st", [8]))
        self.assertEqual(obj, st.local_shards()[1].tensor)

        # good hint
        obj = find_state_dict_object(state_dict, MetadataIndex("st", [8], index=1))
        self.assertEqual(obj, st.local_shards()[1].tensor)

        # bad hint
        obj = find_state_dict_object(state_dict, MetadataIndex("st", [8], index=2))
        self.assertEqual(obj, st.local_shards()[1].tensor)

        # broken hint
        obj = find_state_dict_object(state_dict, MetadataIndex("st", [8], index=99))
        self.assertEqual(obj, st.local_shards()[1].tensor)

        # A sharded tensor cannot be looked up without an offset.
        with self.assertRaisesRegex(ValueError, "no offset was provided"):
            find_state_dict_object(state_dict, MetadataIndex("st"))

        # Offset [1] is not the start of any shard.
        with self.assertRaisesRegex(ValueError, "Could not find shard"):
            find_state_dict_object(state_dict, MetadataIndex("st", [1]))
124
125
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    run_tests()
128