# tifffile/examples/earthbigdata.py

# Copyright (c) 2021-2024, Christoph Gohlke
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

# This file uses VSCode Jupyter-like code cells
# https://code.visualstudio.com/docs/python/jupyter-support-py

# %% [markdown]
"""
# Create a fsspec ReferenceFileSystem for a large set of remote GeoTIFF files

by [Christoph Gohlke](https://www.cgohlke.com)

Updated on May 22, 2024

This Python script uses the [tifffile](https://github.com/cgohlke/tifffile) and
[imagecodecs](https://github.com/cgohlke/imagecodecs) packages to create a
[fsspec ReferenceFileSystem](https://github.com/fsspec/kerchunk) file in
JSON format for the [Earthbigdata](
http://sentinel-1-global-coherence-earthbigdata.s3-website-us-west-2.amazonaws.com
) set, which consists of 1,033,422 GeoTIFF files stored on AWS.
The ReferenceFileSystem is used to create a multi-dimensional Xarray dataset.

Refer to the discussion at [kerchunk/issues/78](
https://github.com/fsspec/kerchunk/issues/78).
"""

# %%
import base64
import os

import fsspec
import imagecodecs.numcodecs
import matplotlib.pyplot
import numcodecs
import tifffile
import xarray
import zarr

# %% [markdown]
"""
## Get a list of all remote TIFF files

Call the AWS command line app to recursively list all files in the Earthbigdata
set. Cache the output in a local file. Filter the list for TIFF files and
remove the common path.
"""

# %%
if not os.path.exists('earthbigdata.txt'):
    os.system(
        'aws s3 ls sentinel-1-global-coherence-earthbigdata/data/tiles'
        ' --recursive > earthbigdata.txt'
    )

with open('earthbigdata.txt', encoding='utf-8') as fh:
    tiff_files = [
        line.split()[-1][11:] for line in fh.readlines() if '.tif' in line
    ]
print('Number of TIFF files:', len(tiff_files))


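# %% [markdown]
"""
Alternatively, the listing can be obtained with fsspec's S3 interface instead
of the AWS CLI. The function below is a hedged sketch, assuming the optional
s3fs package is installed and anonymous access is sufficient; listing roughly
one million keys this way may be slow.
"""

# %%
def list_tiff_files_via_fsspec():
    """Return relative TIFF file names using anonymous S3 access (sketch)."""
    fs = fsspec.filesystem('s3', anon=True)
    keys = fs.find('sentinel-1-global-coherence-earthbigdata/data/tiles')
    return [
        key.split('data/tiles/')[-1]
        for key in keys
        if key.endswith('.tif')
    ]
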
# %% [markdown]
"""
## Define metadata to describe the dataset

Define labels, coordinate arrays, file name regular expression patterns, and
categories for all dimensions in the Earthbigdata set.
"""

# %%
baseurl = (
    'https://'
    'sentinel-1-global-coherence-earthbigdata.s3.us-west-2.amazonaws.com'
    '/data/tiles/'
)

chunkshape = (1200, 1200)
fillvalue = 0

latitude_label = 'latitude'
latitude_pattern = rf'(?P<{latitude_label}>[NS]\d+)'
latitude_coordinates = [
    (j * -0.00083333333 - 0.000416666665 + i)
    for i in range(82, -79, -1)
    for j in range(1200)
]
latitude_category = {}
i = 0
for j in range(82, -1, -1):
    latitude_category[f'N{j:-02}'] = i
    i += 1
for j in range(1, 79):
    latitude_category[f'S{j:-02}'] = i
    i += 1

longitude_label = 'longitude'
longitude_pattern = rf'(?P<{longitude_label}>[EW]\d+)'
longitude_coordinates = [
    (j * 0.00083333333 + 0.000416666665 + i)
    for i in range(-180, 180)
    for j in range(1200)
]
longitude_category = {}
i = 0
for j in range(180, 0, -1):
    longitude_category[f'W{j:-03}'] = i
    i += 1
for j in range(180):
    longitude_category[f'E{j:-03}'] = i
    i += 1

season_label = 'season'
season_category = {'winter': 0, 'spring': 1, 'summer': 2, 'fall': 3}
season_coordinates = list(season_category.keys())
season_pattern = rf'_(?P<{season_label}>{"|".join(season_category)})'

polarization_label = 'polarization'
polarization_category = {'vv': 0, 'vh': 1, 'hv': 2, 'hh': 3}
polarization_coordinates = list(polarization_category.keys())
polarization_pattern = (
    rf'_(?P<{polarization_label}>{"|".join(polarization_category)})'
)

coherence_label = 'coherence'
coherence_category = {
    '06': 0,
    '12': 1,
    '18': 2,
    '24': 3,
    '36': 4,
    '48': 5,
}
coherence_coordinates = list(int(i) for i in coherence_category.keys())
coherence_pattern = (
    rf'_COH(?P<{coherence_label}>{"|".join(coherence_category)})'
)

orbit_label = 'orbit'
orbit_coordinates = list(range(1, 176))
orbit_pattern = rf'_(?P<{orbit_label}>\d+)'

flightdirection_label = 'flightdirection'
flightdirection_category = {'A': 0, 'D': 1}
flightdirection_coordinates = list(flightdirection_category.keys())
flightdirection_pattern = (
    rf'(?P<{flightdirection_label}>[{"|".join(flightdirection_category)}])_'
)


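# %% [markdown]
"""
As a quick sanity check, the patterns and categories defined above can be
applied to a file name to show how chunk indices are derived. This is a
minimal sketch; the file name is a hypothetical example, not taken from the
actual listing.
"""

# %%
import re

example_name = 'N48W090/winter/N48W090_winter_vv_COH12.tif'  # hypothetical
match = re.search(
    latitude_pattern
    + longitude_pattern
    + season_pattern
    + polarization_pattern
    + coherence_pattern,
    example_name,
)
assert match is not None
chunkindex = {
    latitude_label: latitude_category[match.group(latitude_label)],
    longitude_label: longitude_category[match.group(longitude_label)],
    season_label: season_category[match.group(season_label)],
    polarization_label: polarization_category[
        match.group(polarization_label)
    ],
    coherence_label: coherence_category[match.group(coherence_label)],
}
print(chunkindex)
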
# %% [markdown]
"""
## Open a file for writing the fsspec ReferenceFileSystem in JSON format
"""

# %%
jsonfile = open('earthbigdata.json', 'w', encoding='utf-8', newline='\n')

# %% [markdown]
"""
## Write the coordinate arrays

Add the coordinate arrays to a Zarr group, convert it to a fsspec
ReferenceFileSystem JSON string, and write it to the open file.
"""

# %%
coordinates = {}  # type: ignore
zarrgroup = zarr.open_group(coordinates)
zarrgroup.array(
    longitude_label,
    data=longitude_coordinates,
    dtype='float32',
    # compression='zlib',
).attrs['_ARRAY_DIMENSIONS'] = [longitude_label]

zarrgroup.array(
    latitude_label,
    data=latitude_coordinates,
    dtype='float32',
    # compression='zlib',
).attrs['_ARRAY_DIMENSIONS'] = [latitude_label]

zarrgroup.array(
    season_label,
    data=season_coordinates,
    dtype=object,
    object_codec=numcodecs.VLenUTF8(),
    compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [season_label]

zarrgroup.array(
    polarization_label,
    data=polarization_coordinates,
    dtype=object,
    object_codec=numcodecs.VLenUTF8(),
    compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [polarization_label]

zarrgroup.array(
    coherence_label,
    data=coherence_coordinates,
    dtype='uint8',
    compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [coherence_label]

zarrgroup.array(orbit_label, data=orbit_coordinates, dtype='int32').attrs[
    '_ARRAY_DIMENSIONS'
] = [orbit_label]

zarrgroup.array(
    flightdirection_label,
    data=flightdirection_coordinates,
    dtype=object,
    object_codec=numcodecs.VLenUTF8(),
    compression=None,
).attrs['_ARRAY_DIMENSIONS'] = [flightdirection_label]

# base64 encode any values containing non-ascii characters
for k, v in coordinates.items():
    try:
        coordinates[k] = v.decode()
    except UnicodeDecodeError:
        coordinates[k] = 'base64:' + base64.b64encode(v).decode()

coordinates_json = tifffile.ZarrStore._json(coordinates).decode()

jsonfile.write(coordinates_json[:-2])  # skip the last newline and brace

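# %% [markdown]
"""
For reference, the in-memory `coordinates` dict now maps Zarr store keys
(group metadata, array metadata, and encoded chunks) to JSON-serializable
strings. A minimal sketch to inspect its size and a few of its keys:
"""

# %%
print(len(coordinates), 'keys, for example:', sorted(coordinates)[:4])
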
# %% [markdown]
"""
## Create a TiffSequence from a list of file names

Filter the list of GeoTIFF files for files containing coherence 'COH' data.
The regular expression pattern and categories are used to parse the file names
for chunk indices.

Note: the created TiffSequence cannot be used to access any files; the file
names do not refer to existing local files. The `baseurl` defined above is
later used to get the real location of the files.
"""

# %%
mode = 'COH'
fileseq = tifffile.TiffSequence(
    [file for file in tiff_files if '_' + mode in file],
    pattern=(
        latitude_pattern
        + longitude_pattern
        + season_pattern
        + polarization_pattern
        + coherence_pattern
    ),
    categories={
        latitude_label: latitude_category,
        longitude_label: longitude_category,
        season_label: season_category,
        polarization_label: polarization_category,
        coherence_label: coherence_category,
    },
)
assert len(fileseq) == 444821
assert fileseq.files_missing == 5119339
assert fileseq.shape == (161, 360, 4, 4, 6)
assert fileseq.dims == (
    'latitude',
    'longitude',
    'season',
    'polarization',
    'coherence',
)
print(fileseq)


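# %% [markdown]
"""
The file names in the sequence are relative to `baseurl`. As a minimal
sketch, the real URL of the first coherence file can be reconstructed by
joining the two:
"""

# %%
coherence_files = [file for file in tiff_files if '_' + mode in file]
print('Example URL:', baseurl + coherence_files[0])
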
# %% [markdown]
"""
## Create a ZarrTiffStore from the TiffSequence

Define `axestiled` to tile the latitude and longitude dimensions of the
TiffSequence with the first and second image/chunk dimensions.
Define extra `zattrs` to create an Xarray-compatible store.
"""

# %%
store = fileseq.aszarr(
    chunkdtype='uint8',
    chunkshape=chunkshape,
    fillvalue=fillvalue,
    axestiled={0: 0, 1: 1},
    zattrs={
        '_ARRAY_DIMENSIONS': [
            season_label,
            polarization_label,
            coherence_label,
            latitude_label,
            longitude_label,
        ]
    },
)
print(store)

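# %% [markdown]
"""
Tiling the first two sequence dimensions with the two chunk dimensions yields
a virtual array with dimensions (season, polarization, coherence, latitude,
longitude). As a minimal sketch, the expected array shape can be derived from
the sequence shape and the chunk shape:
"""

# %%
expected_shape = (
    fileseq.shape[2],
    fileseq.shape[3],
    fileseq.shape[4],
    fileseq.shape[0] * chunkshape[0],
    fileseq.shape[1] * chunkshape[1],
)
print('Expected virtual array shape:', expected_shape)
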
# %% [markdown]
"""
## Append the ZarrTiffStore to the open ReferenceFileSystem file

Use the mode name to create a Zarr subgroup.
Use the `imagecodecs_tiff` Numcodecs-compatible codec for decoding TIFF files.
"""

# %%
store.write_fsspec(
    jsonfile,
    baseurl,
    groupname=mode,
    codec_id='imagecodecs_tiff',
    _append=True,
    _close=False,
)

# %% [markdown]
"""
## Repeat for the other modes

Repeat the `TiffSequence->aszarr->write_fsspec` workflow for the other modes.
"""

# %%
for mode in (
    'AMP',
    'tau',
    'rmse',
    'rho',
):
    fileseq = tifffile.TiffSequence(
        [file for file in tiff_files if '_' + mode in file],
        pattern=(
            latitude_pattern
            + longitude_pattern
            + season_pattern
            + polarization_pattern
        ),
        categories={
            latitude_label: latitude_category,
            longitude_label: longitude_category,
            season_label: season_category,
            polarization_label: polarization_category,
        },
    )
    print(fileseq)
    with fileseq.aszarr(
        chunkdtype='uint16',
        chunkshape=chunkshape,
        fillvalue=fillvalue,
        axestiled={0: 0, 1: 1},
        zattrs={
            '_ARRAY_DIMENSIONS': [
                season_label,
                polarization_label,
                latitude_label,
                longitude_label,
            ]
        },
    ) as store:
        print(store)
        store.write_fsspec(
            jsonfile,
            baseurl,
            groupname=mode,
            codec_id='imagecodecs_tiff',
            _append=True,
            _close=False,
        )


for mode in ('inc', 'lsmap'):
    fileseq = tifffile.TiffSequence(
        [file for file in tiff_files if '_' + mode in file],
        pattern=(
            latitude_pattern
            + longitude_pattern
            + orbit_pattern
            + flightdirection_pattern
        ),
        categories={
            latitude_label: latitude_category,
            longitude_label: longitude_category,
            # orbit has no category
            flightdirection_label: flightdirection_category,
        },
    )
    print(fileseq)
    with fileseq.aszarr(
        chunkdtype='uint8',
        chunkshape=chunkshape,
        fillvalue=fillvalue,
        axestiled={0: 0, 1: 1},
        zattrs={
            '_ARRAY_DIMENSIONS': [
                orbit_label,
                flightdirection_label,
                latitude_label,
                longitude_label,
            ]
        },
    ) as store:
        print(store)
        store.write_fsspec(
            jsonfile,
            baseurl,
            groupname=mode,
            codec_id='imagecodecs_tiff',
            _append=True,
            _close=mode == 'lsmap',  # close after last store
        )

# %% [markdown]
"""
## Close the JSON file
"""

# %%
jsonfile.close()

# %% [markdown]
"""
## Use the fsspec ReferenceFileSystem file to create an Xarray dataset

Register `imagecodecs.numcodecs` before using the ReferenceFileSystem.
"""

# %%
imagecodecs.numcodecs.register_codecs()

# %% [markdown]
"""
### Create a fsspec mapper instance from the ReferenceFileSystem file

Specify the `target_protocol` to load a local file.
"""

# %%
mapper = fsspec.get_mapper(
    'reference://',
    fo='earthbigdata.json',
    target_protocol='file',
    remote_protocol='https',
)

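# %% [markdown]
"""
The mapper behaves like a mapping of Zarr store keys to bytes. As a hedged
sketch, assuming the root group metadata written earlier is referenced under
the `.zgroup` key, one entry can be read back directly:
"""

# %%
print(mapper['.zgroup'])
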
# %% [markdown]
"""
### Create an Xarray dataset from the mapper

Set `mask_and_scale=False` to disable conversion to floating point.
"""

# %%
dataset = xarray.open_dataset(
    mapper,
    engine='zarr',
    mask_and_scale=False,
    backend_kwargs={'consolidated': False},
)
print(dataset)

# %% [markdown]
"""
### Select the Southern California region in the dataset
"""

# %%
socal = dataset.sel(latitude=slice(36, 32.5), longitude=slice(-121, -115))
print(socal)

# %% [markdown]
"""
### Plot a selection of the dataset

The few GeoTIFF files comprising the selection are transparently downloaded,
decoded, stitched into an in-memory NumPy array, and plotted using Matplotlib.
"""

# %%
image = socal['COH'].loc['winter', 'vv', 12]
assert image[100, 100] == 53
xarray.plot.imshow(image, size=6, aspect=1.8)
matplotlib.pyplot.show()

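# %% [markdown]
"""
The stitched selection is an in-memory, NumPy-backed array and can, for
example, be written back to a local TIFF file with tifffile. This is a
minimal sketch; the output file name is arbitrary:
"""

# %%
tifffile.imwrite('socal_winter_vv_coh12.tif', image.values)
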
# %% [markdown]
"""
## System information

Print information about the software used to run this script.
"""


# %%
def system_info():
    import datetime
    import sys

    import fsspec
    import imagecodecs
    import matplotlib
    import numcodecs
    import numpy
    import tifffile
    import xarray
    import zarr

    return '\n'.join(
        (
            sys.executable,
            f'Python {sys.version}',
            '',
            f'numpy {numpy.__version__}',
            f'matplotlib {matplotlib.__version__}',
            f'tifffile {tifffile.__version__}',
            f'imagecodecs {imagecodecs.__version__}',
            f'numcodecs {numcodecs.__version__}',
            f'fsspec {fsspec.__version__}',
            f'xarray {xarray.__version__}',
            f'zarr {zarr.__version__}',
            '',
            str(datetime.datetime.now()),
        )
    )


print(system_info())