37
# Create a fsspec ReferenceFileSystem for a large set of remote GeoTIFF files
39
by [Christoph Gohlke](https://www.cgohlke.com)
41
Updated on May 22, 2024
43
This Python script uses the [tifffile](https://github.com/cgohlke/tifffile) and
44
[imagecodecs](https://github.com/cgohlke/imagecodecs) packages to create a
45
[fsspec ReferenceFileSystem](https://github.com/fsspec/kerchunk) file in
46
JSON format for the [Earthbigdata](
47
http://sentinel-1-global-coherence-earthbigdata.s3-website-us-west-2.amazonaws.com
48
) set, which consists of 1,033,422 GeoTIFF files stored on AWS.
49
The ReferenceFileSystem is used to create a multi-dimensional Xarray dataset.
51
Refer to the discussion at [kerchunk/issues/78](
52
https://github.com/fsspec/kerchunk/issues/78).
60
import imagecodecs.numcodecs
61
import matplotlib.pyplot
69
## Get a list of all remote TIFF files
71
Call the AWS command line app to recursively list all files in the Earthbigdata
72
set. Cache the output in a local file. Filter the list for TIFF files and
73
remove the common path.
77
if not os.path.exists('earthbigdata.txt'):
79
'aws s3 ls sentinel-1-global-coherence-earthbigdata/data/tiles'
80
' --recursive > earthbigdata.txt'
83
with open('earthbigdata.txt', encoding='utf-8') as fh:
85
line.split()[-1][11:] for line in fh.readlines() if '.tif' in line
87
print('Number of TIFF files:', len(tiff_files))
92
## Define metadata to describe the dataset
94
Define labels, coordinate arrays, file name regular expression patterns, and
95
categories for all dimensions in the Earthbigdata set.
101
'sentinel-1-global-coherence-earthbigdata.s3.us-west-2.amazonaws.com'
105
chunkshape = (1200, 1200)
108
latitude_label = 'latitude'
109
latitude_pattern = rf'(?P<{latitude_label}>[NS]\d+)'
110
latitude_coordinates = [
111
(j * -0.00083333333 - 0.000416666665 + i)
112
for i in range(82, -79, -1)
115
latitude_category = {}
117
for j in range(82, -1, -1):
118
latitude_category[f'N{j:-02}'] = i
120
for j in range(1, 79):
121
latitude_category[f'S{j:-02}'] = i
124
longitude_label = 'longitude'
125
longitude_pattern = rf'(?P<{longitude_label}>[EW]\d+)'
126
longitude_coordinates = [
127
(j * 0.00083333333 + 0.000416666665 + i)
128
for i in range(-180, 180)
131
longitude_category = {}
133
for j in range(180, 0, -1):
134
longitude_category[f'W{j:-03}'] = i
137
longitude_category[f'E{j:-03}'] = i
140
season_label = 'season'
141
season_category = {'winter': 0, 'spring': 1, 'summer': 2, 'fall': 3}
142
season_coordinates = list(season_category.keys())
143
season_pattern = rf'_(?P<{season_label}>{"|".join(season_category)})'
145
polarization_label = 'polarization'
146
polarization_category = {'vv': 0, 'vh': 1, 'hv': 2, 'hh': 3}
147
polarization_coordinates = list(polarization_category.keys())
148
polarization_pattern = (
149
rf'_(?P<{polarization_label}>{"|".join(polarization_category)})'
152
coherence_label = 'coherence'
153
coherence_category = {
161
coherence_coordinates = list(int(i) for i in coherence_category.keys())
163
rf'_COH(?P<{coherence_label}>{"|".join(coherence_category)})'
167
orbit_coordinates = list(range(1, 176))
168
orbit_pattern = rf'_(?P<{orbit_label}>\d+)'
170
flightdirection_label = 'flightdirection'
171
flightdirection_category = {'A': 0, 'D': 1}
172
flightdirection_coordinates = list(flightdirection_category.keys())
173
flightdirection_pattern = (
174
rf'(?P<{flightdirection_label}>[{"|".join(flightdirection_category)}])_'
180
## Open a file for writing the fsspec ReferenceFileSystem in JSON format
184
jsonfile = open('earthbigdata.json', 'w', encoding='utf-8', newline='\n')
188
## Write the coordinate arrays
190
Add the coordinate arrays to a Zarr group, convert it to a fsspec
191
ReferenceFileSystem JSON string, and write it to the open file.
196
zarrgroup = zarr.open_group(coordinates)
199
data=longitude_coordinates,
202
).attrs['_ARRAY_DIMENSIONS'] = [longitude_label]
206
data=latitude_coordinates,
209
).attrs['_ARRAY_DIMENSIONS'] = [latitude_label]
213
data=season_coordinates,
215
object_codec=numcodecs.VLenUTF8(),
217
).attrs['_ARRAY_DIMENSIONS'] = [season_label]
221
data=polarization_coordinates,
223
object_codec=numcodecs.VLenUTF8(),
225
).attrs['_ARRAY_DIMENSIONS'] = [polarization_label]
229
data=coherence_coordinates,
232
).attrs['_ARRAY_DIMENSIONS'] = [coherence_label]
234
zarrgroup.array(orbit_label, data=orbit_coordinates, dtype='int32').attrs[
239
flightdirection_label,
240
data=flightdirection_coordinates,
242
object_codec=numcodecs.VLenUTF8(),
244
).attrs['_ARRAY_DIMENSIONS'] = [flightdirection_label]
247
for k, v in coordinates.items():
249
coordinates[k] = v.decode()
250
except UnicodeDecodeError:
251
coordinates[k] = 'base64:' + base64.b64encode(v).decode()
253
coordinates_json = tifffile.ZarrStore._json(coordinates).decode()
255
jsonfile.write(coordinates_json[:-2])
259
## Create a TiffSequence from a list of file names
261
Filter the list of GeoTIFF files for files containing coherence 'COH' data.
262
The regular expression pattern and categories are used to parse the file names
265
Note: the created TiffSequence cannot be used to access any files. The file
266
names do not refer to existing files. The `baseurl` is later used to get
267
the real location of the files.
272
fileseq = tifffile.TiffSequence(
273
[file for file in tiff_files if '_' + mode in file],
278
+ polarization_pattern
282
latitude_label: latitude_category,
283
longitude_label: longitude_category,
284
season_label: season_category,
285
polarization_label: polarization_category,
286
coherence_label: coherence_category,
289
assert len(fileseq) == 444821
290
assert fileseq.files_missing == 5119339
291
assert fileseq.shape == (161, 360, 4, 4, 6)
292
assert fileseq.dims == (
304
## Create a ZarrTiffStore from the TiffSequence
306
Define `axestiled` to tile the latitude and longitude dimensions of the
307
TiffSequence with the first and second image/chunk dimensions.
308
Define extra `zattrs` to create a Xarray compatible store.
312
store = fileseq.aszarr(
314
chunkshape=chunkshape,
316
axestiled={0: 0, 1: 1},
318
'_ARRAY_DIMENSIONS': [
331
## Append the ZarrTiffStore to the open ReferenceFileSystem file
333
Use the mode name to create a Zarr subgroup.
334
Use the `imagecodecs_tiff` Numcodecs compatible codec for decoding TIFF files.
342
codec_id='imagecodecs_tiff',
349
## Repeat for the other modes
351
Repeat the `TiffSequence->aszarr->write_fsspec` workflow for the other modes.
361
fileseq = tifffile.TiffSequence(
362
[file for file in tiff_files if '_' + mode in file],
367
+ polarization_pattern
370
latitude_label: latitude_category,
371
longitude_label: longitude_category,
372
season_label: season_category,
373
polarization_label: polarization_category,
379
chunkshape=chunkshape,
381
axestiled={0: 0, 1: 1},
383
'_ARRAY_DIMENSIONS': [
396
codec_id='imagecodecs_tiff',
402
for mode in ('inc', 'lsmap'):
403
fileseq = tifffile.TiffSequence(
404
[file for file in tiff_files if '_' + mode in file],
409
+ flightdirection_pattern
412
latitude_label: latitude_category,
413
longitude_label: longitude_category,
415
flightdirection_label: flightdirection_category,
421
chunkshape=chunkshape,
423
axestiled={0: 0, 1: 1},
425
'_ARRAY_DIMENSIONS': [
427
flightdirection_label,
438
codec_id='imagecodecs_tiff',
440
_close=mode == 'lsmap',
445
## Close the JSON file
453
## Use the fsspec ReferenceFileSystem file to create a Xarray dataset
455
Register `imagecodecs.numcodecs` before using the ReferenceFileSystem.
459
imagecodecs.numcodecs.register_codecs()
463
### Create a fsspec mapper instance from the ReferenceFileSystem file
465
Specify the `target_protocol` to load a local file.
469
mapper = fsspec.get_mapper(
471
fo='earthbigdata.json',
472
target_protocol='file',
473
remote_protocol='https',
478
### Create a Xarray dataset from the mapper
480
Use `mask_and_scale` to disable conversion to floating point.
484
dataset = xarray.open_dataset(
487
mask_and_scale=False,
488
backend_kwargs={'consolidated': False},
494
### Select the Southern California region in the dataset
498
socal = dataset.sel(latitude=slice(36, 32.5), longitude=slice(-121, -115))
503
### Plot a selection of the dataset
505
The few GeoTIFF files comprising the selection are transparently downloaded,
506
decoded, and stitched to an in-memory NumPy array and plotted using Matplotlib.
510
image = socal['COH'].loc['winter', 'vv', 12]
511
assert image[100, 100] == 53
512
xarray.plot.imshow(image, size=6, aspect=1.8)
513
matplotlib.pyplot.show()
519
Print information about the software used to run this script.
540
f'Python {sys.version}',
542
f'numpy {numpy.__version__}',
543
f'matplotlib {matplotlib.__version__}',
544
f'tifffile {tifffile.__version__}',
545
f'imagecodecs {imagecodecs.__version__}',
546
f'numcodecs {numcodecs.__version__}',
547
f'fsspec {fsspec.__version__}',
548
f'xarray {xarray.__version__}',
549
f'zarr {zarr.__version__}',
551
str(datetime.datetime.now()),