reposync

yum.py
379 строк · 12.4 Кб
Перенос по словам
1
#
2
# RepoSync.Provider.YUM
3
#
4
# Copyright (c) 2023-2024 Владислав Щапов aka Vladislav Shchapov <vladislav@shchapov.ru>
5
#
6
# Licensed under the Apache License, Version 2.0 (the "License");
7
# you may not use this file except in compliance with the License.
8
# You may obtain a copy of the License at
9
#
10
#     http://www.apache.org/licenses/LICENSE-2.0
11
#
12
# Unless required by applicable law or agreed to in writing, software
13
# distributed under the License is distributed on an "AS IS" BASIS,
14
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
# See the License for the specific language governing permissions and
16
# limitations under the License.
17
#
18

19

20
import hashlib
import json
import os
import os.path
import shutil
import xml.dom.minidom
from dataclasses import dataclass
from typing import Optional

import urllib3

import provider.utility as utility
from provider.base import repo_interface
32

33

34

35
@dataclass
class RepoEntity:
    """Outcome of processing a single repository file.

    Instances are produced by the download step of the provider. Only
    ``http_code`` is always meaningful; the other two fields stay ``None``
    when no checksum was computed or the file body was not requested /
    not available.
    """

    # HTTP status of the request (the provider reports 200 for a file that
    # was served from the local copy).
    http_code: int
    # Hex digest of the file, or None when no checksum was computed.
    hexdigest: Optional[str] = None
    # Raw file body, or None when the caller did not ask for it or the
    # download did not succeed.
    content: Optional[bytes] = None
40

41

42
class Checksum:
    """Expected digest of a repository file: algorithm name plus hex digest.

    The algorithm name is stored in ``name`` after alias resolution
    (``"sha"`` is stored as ``"sha1"``); every other name is kept as given.
    ``hexdigest`` is stored unchanged.
    """

    # Algorithm names that are stored under a different name.
    _ALIASES = {"sha": "sha1"}

    def __init__(self, name, hexdigest):
        self.name = self._ALIASES.get(name, name)
        self.hexdigest = hexdigest
52

53

54
class repo(repo_interface):
    """Mirror provider for YUM repositories.

    The provider scans the local repository directory, downloads the
    repository metadata and every file it references into a temporary
    directory, moves the downloaded files into place and finally reports
    (or removes) local files that are no longer referenced.
    """

    # Chunk size used when streaming a response body straight to disk.
    _STREAM_CHUNK_SIZE = 10 * 1024 * 1024

    def __init__(self, url, dir, opts):
        """Create the provider; arguments are forwarded to the base class."""
        super().__init__("yum", url, dir, opts)

        # HTTP connection pool, created by __init_http().
        self.__http = None

        # Every file present in the on-disk repository before the run.
        # Key: path relative to the repository root, value: os.DirEntry.
        self.__tree_before = dict()

        # Files to move from the temporary directory into the repository.
        # They are moved in reverse order of this list.
        self.__rename_list = list()

        # Files that went through the download step; they must not be removed.
        self.__processed_set = set()

        # Files whose checksum did not match the metadata.
        self.__checksum_mismatch = list()

    # TODO: a read-only check() entry point is not implemented yet.

    def mirror(self):
        """Synchronise the local directory with the remote repository."""
        self.__mirror(True)

    def __scanrepo(self):
        """Record every file currently in the repository directory.

        Fills ``__tree_before``. Directories and everything below the
        temporary directory are skipped.

        Raises:
            NotADirectoryError: the repository path exists but is not a
                directory.
        """
        root = self.dir()
        tmp = self.tmpdir()
        # Compare against "<tmp><sep>" so that only real descendants of the
        # temporary directory match. A plain character-wise prefix test would
        # also match siblings such as "<tmp>2/file".
        tmp_prefix = os.path.join(tmp, "")

        if os.path.isdir(root):
            for entry in utility.scantree(root):
                if entry.is_dir(follow_symlinks=False):
                    continue

                if entry.path == tmp or entry.path.startswith(tmp_prefix):
                    # Temporary directory
                    continue

                self.__tree_before[os.path.relpath(entry.path, root)] = entry

        elif os.path.exists(root):
            raise NotADirectoryError(f"Repository path '{root}' is not a directory.")

    def __mirror(self, write):
        """Run a full mirror pass.

        Args:
            write: flag meant to select between writing changes to disk and
                only showing them. NOTE(review): the flag is not consulted
                anywhere in this method; changes are always written.
        """
        self.__scanrepo()
        self.__init_http()

        self.__process_extra_files_json()
        self.__process_repomd()

        # Move downloaded files into place. A path may have been queued more
        # than once (e.g. referenced twice by the metadata); it can only be
        # moved once, so later occurrences are skipped.
        moved = set()
        for path in reversed(self.__rename_list):
            if path in moved:
                continue
            moved.add(path)

            self.log(0, "Update {}".format(path))
            srcpath = os.path.join(self.tmpdir(), path)
            dstpath = os.path.join(self.dir(), path)
            os.makedirs(os.path.dirname(dstpath), exist_ok=True)
            os.replace(srcpath, dstpath)

        # Remove (or just report) the leftovers: files that were on disk
        # before the run but were not referenced by the repository.
        for path in self.__tree_before:
            if path not in self.__processed_set:
                if self.delete_extraneous():
                    self.log(0, "Remove {}".format(path))
                    os.remove(os.path.join(self.dir(), path))
                else:
                    self.log(0, "Need remove {}".format(path))

        # The temporary directory is removed last.
        shutil.rmtree(self.tmpdir())

        # Clean up directories that became empty.
        utility.delete_empty_folders(self.dir())

    @staticmethod
    def __require_content(entity, path):
        """Return ``entity.content`` or raise if the file could not be fetched.

        Raises:
            RuntimeError: the entity carries no content (the download did not
                end with a usable response).
        """
        if entity.content is None:
            raise RuntimeError(
                "Unable to fetch {} (HTTP status {})".format(path, entity.http_code)
            )
        return entity.content

    def __report_checksum_mismatch(self, url, path, checksum, hexdigest):
        """Log a checksum mismatch and remember the affected path."""
        self.log(0, "Checksum mismatch for {}; Expected: {}; Actual: {}; Digest: {}".format(url, checksum.hexdigest, hexdigest, checksum.name))
        self.__checksum_mismatch.append(path)

    def __process_repodata_primary(self, content):
        """Download every package listed in the primary metadata.

        Args:
            content: uncompressed primary XML document.
        """
        dom = xml.dom.minidom.parseString(content)
        dom.normalize()

        for package in dom.getElementsByTagName("package"):
            checksum_node = package.getElementsByTagName("checksum")[0]
            location_node = package.getElementsByTagName("location")[0]

            path = location_node.getAttribute("href")
            checksum = Checksum(checksum_node.getAttribute("type"), checksum_node.firstChild.nodeValue)

            self.__download(path, checksum=checksum)

    def __process_repomd(self):
        """Download repomd.xml, its signature files and all metadata it lists.

        The ``primary`` metadata is additionally parsed so that the packages
        it references are downloaded as well.

        Raises:
            RuntimeError: repomd.xml or the primary metadata could not be
                fetched.
        """
        self.__download('repodata/repomd.xml.asc')
        self.__download('repodata/repomd.xml.key')

        repomd = self.__download('repodata/repomd.xml', read=True)

        # Fail with a clear message instead of handing None to the XML parser.
        dom = xml.dom.minidom.parseString(
            self.__require_content(repomd, 'repodata/repomd.xml')
        )
        dom.normalize()

        for data in dom.getElementsByTagName("data"):
            data_type = data.getAttribute("type")
            checksum_node = data.getElementsByTagName("checksum")[0]
            location_node = data.getElementsByTagName("location")[0]

            path = location_node.getAttribute("href")
            checksum = Checksum(checksum_node.getAttribute("type"), checksum_node.firstChild.nodeValue)

            if data_type == 'primary':
                primary = self.__download(path, read=True, checksum=checksum)
                self.__process_repodata_primary(
                    utility.unarchiving(path, self.__require_content(primary, path))
                )
            else:
                self.__download(path, checksum=checksum)

    #
    # https://docs.pagure.org/pungi/configuration.html#extra-files-metadata
    #
    def __process_extra_files_json(self):
        """Download extra_files.json and the files it lists, if it is available.

        Nothing is done when the file cannot be fetched with status 200. For
        each listed file the first checksum whose algorithm is available in
        hashlib is used for verification; when none is usable the file is
        downloaded without verification.
        """
        extra_files = self.__download('extra_files.json', read=True)

        if extra_files.http_code != 200:
            return

        listing = json.loads(extra_files.content)
        for entity in listing.get('data', list()):
            if 'file' not in entity:
                continue

            checksum = None
            checksums = entity.get('checksums', dict())
            for name in checksums:
                if name in hashlib.algorithms_available:
                    checksum = Checksum(name, checksums[name])
                    break

            self.__download(entity['file'], checksum=checksum)

    def __init_http(self):
        """Create the HTTP connection pool with the default request headers."""
        default_headers = {
            "User-Agent": "libdnf (AlmaLinux 8.8; generic; Linux.x86_64)"
        }
        self.__http = urllib3.PoolManager(headers=default_headers)

    def __download(self, path, read: bool = False, checksum: Checksum = None) -> RepoEntity:
        """Make sure ``path`` is present locally, downloading it if needed.

        Strategy:
          * With a checksum, the file already in the repository is verified
            first; if it matches, nothing is downloaded.
          * Otherwise a file left in the temporary directory is verified; if
            it matches, it is queued for moving into the repository.
          * Otherwise the file is downloaded into the temporary directory.
            A conditional request (If-Modified-Since) is used only when there
            is no checksum to verify against.

        Args:
            path: file path relative to the repository root / base URL.
                NOTE(review): the path comes from remote metadata and is
                joined to local directories without validation.
            read: when true, the file body is returned in the result.
            checksum: expected digest of the file, or None.

        Returns:
            RepoEntity with the HTTP status (200 also for files served from
            the local copy), the computed hex digest (None without a
            checksum) and the body (None unless ``read`` is true and the file
            was obtained).
        """
        url = utility.url_concat(self.url(), path)
        srcpath = os.path.join(self.dir(), path)
        dstpath = os.path.join(self.tmpdir(), path)

        self.log(0, "Processing {}".format(url))

        meta = self.__tree_before.get(path)
        if checksum is not None:
            if meta is not None:
                info = utility.read_or_checksum(srcpath, read, checksum.name)
                if info[0] == checksum.hexdigest:
                    self.__processed_set.add(path)
                    return RepoEntity(200, info[0], info[1])

            if os.path.isfile(dstpath):
                info = utility.read_or_checksum(dstpath, read, checksum.name)
                if info[0] == checksum.hexdigest:
                    self.__processed_set.add(path)
                    self.__rename_list.append(path)
                    return RepoEntity(200, info[0], info[1])

        os.makedirs(os.path.dirname(dstpath), exist_ok=True)

        headers = {}

        # TODO: consider downloading with a check against the old file only;
        # the file would then be read once and the temporary directory would
        # not have to be inspected.

        # Only ask the server for "newer than my copy" when the local copy
        # could not be verified at all. If a checksum was supplied and we got
        # here, the local copy has already failed verification, and a
        # 304 Not Modified answer would leave the bad file in place.
        if meta is not None and checksum is None:
            stat = meta.stat(follow_symlinks=False)
            headers["If-Modified-Since"] = utility.time_to_if_modified_since(stat.st_mtime)

        response = self.__http.request("GET", url, headers=headers, preload_content=False)

        try:
            http_code = response.status
            hexdigest = None
            content = None

            if http_code == 200:
                file_hash = hashlib.new(checksum.name) if checksum is not None else None

                with open(dstpath, 'wb') as fd:
                    if read:
                        # The caller needs the body, so load it in one piece.
                        content = response.data
                        if file_hash is not None:
                            file_hash.update(content)
                        fd.write(content)
                    else:
                        for chunk in response.stream(self._STREAM_CHUNK_SIZE):
                            if file_hash is not None:
                                file_hash.update(chunk)
                            fd.write(chunk)

                # Carry the server's modification time over to the local file
                # so that later conditional requests compare against it.
                if 'Last-Modified' in response.headers:
                    mtime = utility.if_modified_since_to_time(response.headers['Last-Modified'])
                    os.utime(dstpath, (mtime, mtime))

                if checksum is not None:
                    hexdigest = file_hash.hexdigest()
                    if hexdigest != checksum.hexdigest:
                        self.__report_checksum_mismatch(url, path, checksum, hexdigest)

                # NOTE(review): a file with a mismatching checksum is still
                # queued for moving into the repository.
                self.__processed_set.add(path)
                self.__rename_list.append(path)
            elif http_code == 304:
                # The local copy is current: read / hash it if requested.
                if read or checksum is not None:
                    info = utility.read_or_checksum(srcpath, read, checksum.name if checksum is not None else None)
                    hexdigest = info[0]
                    content = info[1]

                # Callers treat an up-to-date local copy like a fresh download.
                http_code = 200
                if checksum is not None and hexdigest != checksum.hexdigest:
                    self.__report_checksum_mismatch(url, path, checksum, hexdigest)

                self.__processed_set.add(path)
        finally:
            response.release_conn()

        return RepoEntity(http_code, hexdigest, content)
375

376

377
    #
378
    #
379
    #
380
reposync

Использование cookies