demo-ml-pennfudanped

train_model.ipynb
474 строки · 29.3 Кб
Перенос по словам
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": 76,
6
   "id": "0df75687",
7
   "metadata": {
8
    "cellId": "jziuwa87tkxdnpvgjqd3q",
9
    "collapsed": true,
10
    "jupyter": {
11
     "outputs_hidden": true
12
    }
13
   },
14
   "outputs": [
15
    {
16
     "name": "stdout",
17
     "output_type": "stream",
18
     "text": [
19
      "Defaulting to user installation because normal site-packages is not writeable\n",
20
      "Requirement already satisfied: pycocotools in /home/jupyter/.local/lib/python3.8/site-packages (2.0.5)\n",
21
      "Requirement already satisfied: matplotlib>=2.1.0 in /kernel/lib/python3.8/site-packages (from pycocotools) (3.3.3)\n",
22
      "Requirement already satisfied: numpy in /kernel/fallback/lib/python3.8/site-packages (from pycocotools) (1.19.4)\n",
23
      "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /kernel/lib/python3.8/site-packages (from matplotlib>=2.1.0->pycocotools) (2.4.7)\n",
24
      "Requirement already satisfied: pillow>=6.2.0 in /kernel/lib/python3.8/site-packages (from matplotlib>=2.1.0->pycocotools) (9.2.0)\n",
25
      "Requirement already satisfied: kiwisolver>=1.0.1 in /kernel/lib/python3.8/site-packages (from matplotlib>=2.1.0->pycocotools) (1.4.4)\n",
26
      "Requirement already satisfied: python-dateutil>=2.1 in /kernel/lib/python3.8/site-packages (from matplotlib>=2.1.0->pycocotools) (2.8.2)\n",
27
      "Requirement already satisfied: cycler>=0.10 in /kernel/lib/python3.8/site-packages (from matplotlib>=2.1.0->pycocotools) (0.11.0)\n",
28
      "Requirement already satisfied: six>=1.5 in /kernel/lib/python3.8/site-packages (from python-dateutil>=2.1->matplotlib>=2.1.0->pycocotools) (1.16.0)\n",
29
      "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.3 is available.\n",
30
      "You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
31
      "Defaulting to user installation because normal site-packages is not writeable\n",
32
      "Requirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (4.50.0)\n",
33
      "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.3 is available.\n",
34
      "You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
35
      "Defaulting to user installation because normal site-packages is not writeable\n",
36
      "Requirement already satisfied: torchvision in /usr/local/lib/python3.8/dist-packages (0.10.1+cu111)\n",
37
      "Collecting torchvision\n",
38
      "  Downloading torchvision-0.13.1-cp38-cp38-manylinux1_x86_64.whl (19.1 MB)\n",
39
      "     |████████████████████████████████| 19.1 MB 1.7 MB/s            \n",
40
      "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.8/dist-packages (from torchvision) (3.7.4.3)\n",
41
      "Collecting torch==1.12.1\n",
42
      "  Downloading torch-1.12.1-cp38-cp38-manylinux1_x86_64.whl (776.3 MB)\n",
43
      "     |████████████████████████████████| 776.3 MB 372 bytes/s           \n",
44
      "\u001b[?25hRequirement already satisfied: requests in /kernel/lib/python3.8/site-packages (from torchvision) (2.25.1)\n",
45
      "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /kernel/lib/python3.8/site-packages (from torchvision) (9.2.0)\n",
46
      "Requirement already satisfied: numpy in /kernel/fallback/lib/python3.8/site-packages (from torchvision) (1.19.4)\n",
47
      "Requirement already satisfied: idna<3,>=2.5 in /kernel/lib/python3.8/site-packages (from requests->torchvision) (2.10)\n",
48
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /kernel/lib/python3.8/site-packages (from requests->torchvision) (1.26.12)\n",
49
      "Requirement already satisfied: chardet<5,>=3.0.2 in /kernel/lib/python3.8/site-packages (from requests->torchvision) (4.0.0)\n",
50
      "Requirement already satisfied: certifi>=2017.4.17 in /kernel/lib/python3.8/site-packages (from requests->torchvision) (2022.9.24)\n",
51
      "Installing collected packages: torch, torchvision\n",
52
      "\u001b[33m  WARNING: The scripts convert-caffe2-to-onnx, convert-onnx-to-caffe2 and torchrun are installed in '/home/jupyter/.local/bin' which is not on PATH.\n",
53
      "  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n",
54
      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
55
      "torchaudio 0.9.1 requires torch==1.9.1, but you have torch 1.12.1 which is incompatible.\u001b[0m\n",
56
      "Successfully installed torch-1.12.1 torchvision-0.13.1\n",
57
      "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.3 is available.\n",
58
      "You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n",
59
      "Defaulting to user installation because normal site-packages is not writeable\n",
60
      "Requirement already satisfied: torch in /home/jupyter/.local/lib/python3.8/site-packages (1.12.1)\n",
61
      "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.8/dist-packages (from torch) (3.7.4.3)\n",
62
      "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.3 is available.\n",
63
      "You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
64
     ]
65
    }
66
   ],
67
   "source": [
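68
    "# One-time environment setup: uncomment and run on a fresh kernel.\n",
    "# torch/torchvision are pinned to the versions this cell's saved output reports as installed.\n",
    "# %pip install pycocotools\n",
69
    "# %pip install tqdm\n",
70
    "# %pip install torchvision==0.13.1\n",
71
    "# %pip install torch==1.12.1"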
72
   ]
73
  },
74
  {
75
   "cell_type": "code",
76
   "execution_count": 13,
77
   "id": "e8527c6e",
78
   "metadata": {
79
    "cellId": "ech9jthm9zrjuq7fn2bs"
80
   },
81
   "outputs": [],
82
   "source": [
83
    "import copy\n",
    "import math\n",
    "import sys\n",
    "\n",
    "from tqdm import tqdm\n",
84
    "import torch\n",
85
    "import torchvision\n",
86
    "from torch.utils.data import DataLoader\n",
87
    "from masks_for_mask_r_cnn_dataset import MasksForMaskRCNNDataset\n",
88
    "from custom_segmentation_transforms import Compose, ToTensor, RandomHorizontalFlip\n",
89
    "from image_utils import show_image, build_box_masks, merge_image_and_masks_boxes, merge_masks_with_colors\n",
90
    "from torch.utils.data import random_split\n",
91
    "from maskrcnn_model import build_maskrsnn_model\n",
92
    "from metrics import accumulate_metrics, compute_metrics, coco_metric_names"
93
   ]
94
  },
95
  {
96
   "cell_type": "code",
97
   "execution_count": 14,
98
   "id": "57636714",
99
   "metadata": {
100
    "cellId": "zamp0lyvjrjlqyekmhu2am"
101
   },
102
   "outputs": [],
103
   "source": [
104
    "# Dataset locations; every path is derived from ROOT.\n",
    "ROOT = '/home/jupyter/mnt/s3'\n",
105
    "DS_ROOT = ROOT + '/pennfudanped'\n",
106
    "DS_MASKS = DS_ROOT + '/PedMasks'\n",
107
    "DS_IMAGES = DS_ROOT + '/PNGImages'\n",
108
    "\n",
    "# Hyper-parameters read by the loader, model/optimizer and training cells below.\n",
109
    "PARAMS = {\n",
110
    "    'batch_size': 1,  # used for both train_loader and val_loader\n",
111
    "    'epochs': 1,  # iterations of the outer training loop\n",
112
    "    'lr': 0.001,  # SGD learning rate\n",
113
    "    'momentum': 0.9,  # SGD momentum\n",
114
    "    'weight_decay': 0.0005,  # SGD weight decay\n",
115
    "    'step_size': 3,  # StepLR period, counted in epochs\n",
116
    "    'gamma': 0.1,  # StepLR multiplicative decay\n",
117
    "    # Weights handed to build_maskrsnn_model.\n",
    "    'pretrained': torchvision.models.detection.MaskRCNN_ResNet50_FPN_Weights.DEFAULT\n",
118
    "}"
119
   ]
120
  },
121
  {
122
   "cell_type": "code",
123
   "execution_count": 15,
124
   "id": "02cfedc8",
125
   "metadata": {
126
    "cellId": "0opprkt2dlxsvv4kuogazah"
127
   },
128
   "outputs": [
129
    {
130
     "name": "stdout",
131
     "output_type": "stream",
132
     "text": [
133
      "num_of_targets = 1\n"
134
     ]
135
    }
136
   ],
137
   "source": [
138
    "# Build the full dataset from the image and mask folders.\n",
    "# NOTE(review): the random flip is attached to the whole dataset, so the validation\n",
    "# subset created later is augmented as well - confirm this is intended.\n",
    "full_dataset = MasksForMaskRCNNDataset(\n",
139
    "    images_root=DS_IMAGES,\n",
140
    "    masks_root=DS_MASKS,\n",
141
    "    transforms=Compose([ToTensor(), RandomHorizontalFlip(0.5)]),\n",
145
    ")\n",
    "\n",
    "# Number of foreground target classes, passed to the model builder below.\n",
146
    "num_of_targets = 1\n",
147
    "print(f'num_of_targets = {num_of_targets}')"
148
   ]
149
  },
150
  {
151
   "cell_type": "code",
152
   "execution_count": 16,
153
   "id": "142cf22a",
154
   "metadata": {
155
    "cellId": "eam1tkh4we33jiixj8t46"
156
   },
157
   "outputs": [],
158
   "source": [
159
    "# 80/20 train/validation split; the generator is seeded so the partition is repeatable.\n",
    "train_size = int(0.8 * len(full_dataset))\n",
160
    "val_size = len(full_dataset) - train_size\n",
161
    "\n",
162
    "dataset_train, dataset_val = random_split(\n",
163
    "    full_dataset,\n",
    "    lengths=[train_size, val_size],\n",
164
    "    generator=torch.Generator().manual_seed(0),\n",
    ")"
165
   ]
166
  },
167
  {
168
   "cell_type": "code",
169
   "execution_count": 17,
170
   "id": "839f0590",
171
   "metadata": {
172
    "cellId": "47b0jkr0hu7uznuewj20p"
173
   },
174
   "outputs": [],
175
   "source": [
176
    "# Training batches are reshuffled every epoch; incomplete trailing batches are dropped.\n",
    "train_loader = DataLoader(\n",
177
    "    dataset_train,\n",
178
    "    batch_size=PARAMS['batch_size'],\n",
179
    "    shuffle=True,\n",
180
    "    pin_memory=True,\n",
181
    "    drop_last=True\n",
182
    ")\n",
    "# Validation must see every sample in a fixed order so metrics are comparable across\n",
    "# epochs and runs: no shuffling and no dropped trailing batch.\n",
183
    "val_loader = DataLoader(\n",
184
    "    dataset_val,\n",
185
    "    batch_size=PARAMS['batch_size'],\n",
186
    "    shuffle=False,\n",
187
    "    pin_memory=True,\n",
188
    "    drop_last=False\n",
189
    ")"
190
   ]
191
  },
192
  {
193
   "cell_type": "code",
194
   "execution_count": 18,
195
   "id": "c46c38ac",
196
   "metadata": {
197
    "cellId": "tii3mrngw7gi0heqnjzir"
198
   },
199
   "outputs": [
200
    {
201
     "name": "stdout",
202
     "output_type": "stream",
203
     "text": [
204
      "torch.Size([1, 3, 383, 456])\n",
205
      "torch.Size([1, 1, 4])\n",
206
      "torch.Size([1, 1])\n",
207
      "torch.Size([1, 1])\n",
208
      "torch.Size([1, 1])\n",
209
      "torch.Size([1, 1])\n"
210
     ]
211
    }
212
   ],
213
   "source": [
214
    "# Pull one batch and print tensor shapes as a quick sanity check of the loader output.\n",
    "image, targets = next(iter(train_loader))\n",
215
    "\n",
216
    "print(image.shape)\n",
217
    "print(*(targets[field].shape for field in ('boxes', 'labels', 'image_id', 'area', 'iscrowd')), sep='\\n')"
222
   ]
223
  },
224
  {
225
   "cell_type": "code",
226
   "execution_count": 23,
227
   "id": "efa59496",
228
   "metadata": {
229
    "cellId": "3nww1emr9a6n4r82fa53vj"
230
   },
231
   "outputs": [],
232
   "source": [
233
    "# torchvision detection heads reserve class index 0 for background, so the classifier\n",
    "# needs one more output than the number of foreground target classes. The saved run of\n",
    "# the training cell aborted with `t >= 0 && t < n_classes` when num_of_targets was\n",
    "# passed unchanged.\n",
    "# NOTE(review): assumes build_maskrsnn_model forwards this value as num_classes without\n",
    "# adding the background class itself - confirm in maskrcnn_model.py.\n",
    "model = build_maskrsnn_model(num_of_targets + 1, PARAMS['pretrained'])\n",
234
    "\n",
235
    "# Optimise only the parameters that are not frozen.\n",
    "params = [p for p in model.parameters() if p.requires_grad]\n",
236
    "optimizer = torch.optim.SGD(params, lr=PARAMS['lr'], momentum=PARAMS['momentum'], weight_decay=PARAMS['weight_decay'])\n",
237
    "# Epoch-level schedule; it is stepped once per epoch in the training cell.\n",
    "lr_scheduler_global = torch.optim.lr_scheduler.StepLR(optimizer, step_size=PARAMS['step_size'], gamma=PARAMS['gamma'])\n",
238
    "\n",
239
    "epoch_start = 0\n",
240
    "# Best segm mAP seen so far and the matching weights, updated by the training cell.\n",
    "best_accuracy = 0\n",
241
    "best_state = None"
242
   ]
243
  },
244
  {
245
   "cell_type": "code",
246
   "execution_count": 20,
247
   "id": "83f5867f",
248
   "metadata": {
249
    "cellId": "hmz9wi2duwu11ihuuu8m9u"
250
   },
251
   "outputs": [
252
    {
253
     "name": "stdout",
254
     "output_type": "stream",
255
     "text": [
256
      "device = cuda\n"
257
     ]
258
    },
259
    {
260
     "name": "stderr",
261
     "output_type": "stream",
262
     "text": [
263
      "Downloading: \"https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth\" to /tmp/xdg_cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth\n"
264
     ]
265
    },
266
    {
267
     "data": {
268
      "application/vnd.jupyter.widget-view+json": {
269
       "model_id": "979c05f7b1334fad87c241ff5f61e8c8",
270
       "version_major": 2,
271
       "version_minor": 0
272
      },
273
      "text/plain": [
274
       "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=178090079.0), HTML(value='')))"
275
      ]
276
     },
277
     "metadata": {},
278
     "output_type": "display_data"
279
    },
280
    {
281
     "name": "stdout",
282
     "output_type": "stream",
283
     "text": [
284
      "\n"
285
     ]
286
    },
287
    {
288
     "name": "stderr",
289
     "output_type": "stream",
290
     "text": [
291
      "  1%|          | 1/136 [00:05<12:16,  5.46s/it]../aten/src/ATen/native/cuda/Loss.cu:271: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.\n",
292
      "../aten/src/ATen/native/cuda/Loss.cu:271: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [1,0,0] Assertion `t >= 0 && t < n_classes` failed.\n",
293
      "../aten/src/ATen/native/cuda/Loss.cu:271: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [3,0,0] Assertion `t >= 0 && t < n_classes` failed.\n",
294
      "../aten/src/ATen/native/cuda/Loss.cu:271: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [9,0,0] Assertion `t >= 0 && t < n_classes` failed.\n",
295
      "../aten/src/ATen/native/cuda/Loss.cu:271: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [11,0,0] Assertion `t >= 0 && t < n_classes` failed.\n",
296
      "../aten/src/ATen/native/cuda/Loss.cu:271: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [14,0,0] Assertion `t >= 0 && t < n_classes` failed.\n",
297
      "../aten/src/ATen/native/cuda/Loss.cu:271: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [18,0,0] Assertion `t >= 0 && t < n_classes` failed.\n",
298
      "../aten/src/ATen/native/cuda/Loss.cu:271: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [20,0,0] Assertion `t >= 0 && t < n_classes` failed.\n",
299
      "../aten/src/ATen/native/cuda/Loss.cu:271: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [22,0,0] Assertion `t >= 0 && t < n_classes` failed.\n",
300
      "../aten/src/ATen/native/cuda/Loss.cu:271: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [23,0,0] Assertion `t >= 0 && t < n_classes` failed.\n",
301
      "../aten/src/ATen/native/cuda/Loss.cu:271: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [25,0,0] Assertion `t >= 0 && t < n_classes` failed.\n",
302
      "../aten/src/ATen/native/cuda/Loss.cu:271: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [26,0,0] Assertion `t >= 0 && t < n_classes` failed.\n",
303
      "../aten/src/ATen/native/cuda/Loss.cu:271: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [30,0,0] Assertion `t >= 0 && t < n_classes` failed.\n"
304
     ]
305
    },
306
    {
307
     "ename": "RuntimeError",
308
     "evalue": "CUDA error: device-side assert triggered\nCUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1.",
309
     "output_type": "error",
310
     "traceback": [
311
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
312
      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
313
      "\u001b[0;32m<ipython-input-1-ab096cc10900>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     51\u001b[0m         \u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     52\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mamp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautocast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menabled\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscaler\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 53\u001b[0;31m             \u001b[0mloss_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimages\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtargets\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     54\u001b[0m             \u001b[0mlosses\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mloss\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mloss_dict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     55\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
314
      "\u001b[0;32m~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m   1128\u001b[0m         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks\n\u001b[1;32m   1129\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1130\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1131\u001b[0m         \u001b[0;31m# Do not call functions when jit is used\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1132\u001b[0m         \u001b[0mfull_backward_hooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnon_full_backward_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
315
      "\u001b[0;32m~/.local/lib/python3.8/site-packages/torchvision/models/detection/generalized_rcnn.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, images, targets)\u001b[0m\n\u001b[1;32m    103\u001b[0m             \u001b[0mfeatures\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mOrderedDict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"0\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeatures\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    104\u001b[0m         \u001b[0mproposals\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproposal_losses\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrpn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimages\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeatures\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtargets\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 105\u001b[0;31m         \u001b[0mdetections\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdetector_losses\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mroi_heads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproposals\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mimages\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mimage_sizes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtargets\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    106\u001b[0m         \u001b[0mdetections\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpostprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdetections\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mimages\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mimage_sizes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moriginal_image_sizes\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# type: 
ignore[operator]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    107\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
316
      "\u001b[0;32m~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m   1128\u001b[0m         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks\n\u001b[1;32m   1129\u001b[0m                 or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1130\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1131\u001b[0m         \u001b[0;31m# Do not call functions when jit is used\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1132\u001b[0m         \u001b[0mfull_backward_hooks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnon_full_backward_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
317
      "\u001b[0;32m~/.local/lib/python3.8/site-packages/torchvision/models/detection/roi_heads.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, features, proposals, image_shapes, targets)\u001b[0m\n\u001b[1;32m    770\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mregression_targets\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    771\u001b[0m                 \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"regression_targets cannot be None\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 772\u001b[0;31m             \u001b[0mloss_classifier\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloss_box_reg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfastrcnn_loss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclass_logits\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbox_regression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mregression_targets\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    773\u001b[0m             \u001b[0mlosses\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m\"loss_classifier\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mloss_classifier\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"loss_box_reg\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mloss_box_reg\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    774\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
318
      "\u001b[0;32m~/.local/lib/python3.8/site-packages/torchvision/models/detection/roi_heads.py\u001b[0m in \u001b[0;36mfastrcnn_loss\u001b[0;34m(class_logits, box_regression, labels, regression_targets)\u001b[0m\n\u001b[1;32m     34\u001b[0m     \u001b[0;31m# the corresponding ground truth labels, to be used with\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     35\u001b[0m     \u001b[0;31m# advanced indexing\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 36\u001b[0;31m     \u001b[0msampled_pos_inds_subset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwhere\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     37\u001b[0m     \u001b[0mlabels_pos\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0msampled_pos_inds_subset\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     38\u001b[0m     \u001b[0mN\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_classes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclass_logits\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
319
      "\u001b[0;31mRuntimeError\u001b[0m: CUDA error: device-side assert triggered\nCUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1."
320
     ]
321
    }
322
   ],
323
   "source": [
324
    "#!g2.mig\n",
325
    "\n",
    "# Train for PARAMS['epochs'] epochs. After each epoch the model is run over val_loader,\n",
    "# metrics are computed with compute_metrics, and the weights with the best segm mAP so\n",
    "# far are kept in best_state. Requires copy, math and sys from the imports cell.\n",
326
    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
327
    "print( f'device = {device}' )\n",
328
    "\n",
329
    "model.to(device)\n",
330
    "model.eval()\n",
331
    "\n",
332
    "for epoch in range(PARAMS['epochs']):\n",
333
    "    pbar = tqdm(total=len(train_loader.dataset))\n",
334
    "\n",
    "    # Running sums of each loss term over the epoch (summed over batches, not averaged).\n",
335
    "    acc_loss_value = 0\n",
336
    "    acc_loss_classifier = 0\n",
337
    "    acc_loss_box_reg = 0\n",
338
    "    acc_loss_mask = 0\n",
339
    "    acc_loss_objectness = 0\n",
340
    "    acc_loss_rpn_box_reg = 0\n",
341
    "\n",
342
    "    model.train()\n",
    "    # No GradScaler is created, so autocast below stays disabled and the plain\n",
    "    # backward/step branch is taken.\n",
343
    "    scaler = None\n",
344
    "    lr_scheduler = None\n",
345
    "    if epoch == 0:\n",
    "        # Per-iteration linear warm-up, first epoch only.\n",
346
    "        warmup_factor = 1.0 / 1000\n",
347
    "        warmup_iters = min(1000, len(train_loader) - 1)\n",
348
    "        lr_scheduler = torch.optim.lr_scheduler.LinearLR(\n",
349
    "            optimizer, start_factor=warmup_factor, total_iters=warmup_iters\n",
350
    "        )\n",
351
    "    for images, targets in train_loader:\n",
352
    "        pbar.update(len(images))\n",
353
    "\n",
354
    "        images = list(image.to(device) for image in images)\n",
355
    "        # targets = [{k: v.to(device) for k, v in t.items()} for t in targets]\n",
    "        # The loader yields one dict of batched tensors, while the model is called with\n",
    "        # a list holding one dict per image. Only batch index 0 is unpacked, so this\n",
    "        # relies on PARAMS['batch_size'] == 1.\n",
356
    "        device_target = {}\n",
357
    "        for k in ( 'boxes', 'labels', 'masks', 'image_id', 'area', 'iscrowd' ):\n",
358
    "            device_target[k] = targets[k].to(device)\n",
359
    "        targets = [{\n",
360
    "            'boxes': device_target['boxes'][0,:,:],\n",
361
    "            'labels': device_target['labels'][0,:],\n",
362
    "            'masks': device_target['masks'][0,:],\n",
363
    "            'image_id': device_target['image_id'][0,:],\n",
364
    "            'area': device_target['area'][0,:],\n",
365
    "            'iscrowd': device_target['iscrowd'][0,:],\n",
366
    "        }]\n",
367
    "        #\n",
368
    "        with torch.cuda.amp.autocast(enabled=scaler is not None):\n",
369
    "            loss_dict = model(images, targets)\n",
370
    "            losses = sum(loss for loss in loss_dict.values())\n",
371
    "\n",
372
    "        loss_value = losses.item()\n",
373
    "        acc_loss_value += loss_value\n",
374
    "        acc_loss_classifier += loss_dict['loss_classifier'].item()\n",
375
    "        acc_loss_box_reg += loss_dict['loss_box_reg'].item()\n",
376
    "        acc_loss_mask += loss_dict['loss_mask'].item()\n",
377
    "        acc_loss_objectness += loss_dict['loss_objectness'].item()\n",
378
    "        acc_loss_rpn_box_reg += loss_dict['loss_rpn_box_reg'].item()\n",
379
    "\n",
    "        # Abort on NaN/inf loss instead of stepping the optimizer with it.\n",
380
    "        if not math.isfinite(loss_value):\n",
381
    "            print(f\"Loss is {loss_value}, stopping training\")\n",
382
    "            sys.exit(1)\n",
383
    "\n",
384
    "        optimizer.zero_grad()\n",
385
    "        if scaler is not None:\n",
386
    "            scaler.scale(losses).backward()\n",
387
    "            scaler.step(optimizer)\n",
388
    "            scaler.update()\n",
389
    "        else:\n",
390
    "            losses.backward()\n",
391
    "            optimizer.step()\n",
392
    "        if lr_scheduler is not None:\n",
393
    "            lr_scheduler.step()\n",
394
    "\n",
395
    "    pbar.close()\n",
396
    "\n",
    "    # Read the learning rate used during this epoch before the epoch-level scheduler changes it.\n",
397
    "    current_lr = optimizer.param_groups[0][\"lr\"]\n",
398
    "    lr_scheduler_global.step()\n",
399
    "\n",
400
    "    model.eval()\n",
401
    "    pbar = tqdm(total=len(val_loader.dataset))\n",
402
    "    # iou_types = \"segm\"  # \"segm\", \"bbox\", \"keypoints\"\n",
403
    "    segmPredicted = []\n",
404
    "    bboxPredicted = []\n",
    "    # Inference only: disable autograd so no graph is kept alive for the predictions.\n",
    "    with torch.no_grad():\n",
405
    "        for images, targets in val_loader:\n",
406
    "            pbar.update(len(images))\n",
407
    "            images = list(img.to(device) for img in images)\n",
408
    "            if torch.cuda.is_available():\n",
409
    "                torch.cuda.synchronize()\n",
410
    "            predictions = model(images)\n",
411
    "            accumulate_metrics(segmPredicted, bboxPredicted, targets, predictions)\n",
412
    "\n",
    "    # NOTE(review): val_loader.dataset is the Subset returned by random_split, which does\n",
    "    # not forward attributes of the wrapped dataset - confirm that `.coco` resolves here.\n",
413
    "    metrics = compute_metrics(val_loader.dataset.coco, segmPredicted, bboxPredicted)\n",
414
    "    pbar.close()\n",
415
    "\n",
    "    # bbox.mAP uses positional argument 7; index 6 is the segm mAP.\n",
416
    "    print(\"Epoch: {0}; lr={1:.4f}; loss={2:.4f}; loss_mask={3:.4f}; cls={4:.4f}; box={5:.4f}; segm.mAP={6:.3f}; bbox.mAP={7:.3f}\"\n",
417
    "          .format(epoch, current_lr, acc_loss_value, acc_loss_mask, acc_loss_classifier, acc_loss_box_reg, metrics['segm']['mAP'], metrics['bbox']['mAP']))\n",
418
    "\n",
    "    # Flat name -> value summary of this epoch (only consumed by the disabled mlflow call).\n",
419
    "    metrics_summary = {\n",
420
    "        'current_lr': current_lr,\n",
421
    "        'acc_loss_value': acc_loss_value,\n",
422
    "        'acc_loss_mask': acc_loss_mask,\n",
423
    "        'acc_loss_classifier': acc_loss_classifier,\n",
424
    "        'acc_loss_box_reg': acc_loss_box_reg,\n",
425
    "        'acc_loss_objectness': acc_loss_objectness,\n",
426
    "        'acc_loss_rpn_box_reg': acc_loss_rpn_box_reg,\n",
427
    "    }\n",
428
    "    for mtype in ('segm', 'bbox'):\n",
429
    "        for metric_key in coco_metric_names.keys():\n",
430
    "            metrics_summary[mtype+'.'+metric_key] = metrics[mtype][metric_key]\n",
431
    "    # mlflow.log_metrics(metrics_summary)\n",
432
    "\n",
433
    "#     mlflow.pytorch.log_state_dict(artifact_path=\"checkpoint\", state_dict={\n",
434
    "#         \"model\": model.state_dict(),\n",
435
    "#         \"optimizer\": optimizer.state_dict(),\n",
436
    "#         \"epoch\": epoch,\n",
437
    "#         \"best_accuracy\": metrics['segm']['mAP']\n",
438
    "#     })\n",
    "    # Keep a detached copy of the weights whenever segm mAP improves.\n",
439
    "    if metrics['segm']['mAP'] > best_accuracy:\n",
440
    "        best_accuracy = metrics['segm']['mAP']\n",
441
    "        best_state = copy.deepcopy(model.state_dict())"
443
   ]
444
  },
445
  {
446
   "cell_type": "code",
447
   "execution_count": null,
448
   "id": "a926cacf",
449
   "metadata": {
450
    "cellId": "74qvm7e2emcpjd0k860d6"
451
   },
452
   "outputs": [],
453
   "source": []
454
  }
455
 ],
456
 "metadata": {
457
  "language_info": {
458
   "codemirror_mode": {
459
    "name": "ipython",
460
    "version": 3
461
   },
462
   "file_extension": ".py",
463
   "mimetype": "text/x-python",
464
   "name": "python",
465
   "nbconvert_exporter": "python",
466
   "pygments_lexer": "ipython3",
467
   "version": "3.7.7"
468
  },
469
  "notebookId": "f1223451-1595-404f-8bde-8aa2570af0fe",
470
  "notebookPath": "demo-ml-pennfudanped/train_model.ipynb"
471
 },
472
 "nbformat": 4,
473
 "nbformat_minor": 5
474
}
475
demo-ml-pennfudanped

Использование cookies