v

Зеркало из https://github.com/vlang/v
Форк
0
/
csv_reader_random_access.v 
550 строк · 14.6 Кб
1
/*
2
csv random access reader 1.0 alpha
3

4
Copyright (c) 2023 Dario Deledda. All rights reserved.
5
Use of this source code is governed by an MIT license
6
that can be found in the LICENSE file.
7

8
Known limitations:
9
- no stream reading
10
*/
11
module csv
12

13
import os
14

15
/******************************************************************************
16
*
17
* Consts
18
*
19
******************************************************************************/
20
// endline lengths, in bytes
// NOTE: despite the `cr` in its name, endline_cr_len is the length of a single `\n` terminator
pub const endline_cr_len = 1
pub const endline_crlf_len = 2

// Type of read buffer
pub const ram_csv = 1 // the csv is read from a memory buffer supplied by the caller
pub const file_csv = 0 // the csv is read from a file, through an internal buffer
27

28
/******************************************************************************
29
*
30
* Structs
31
*
32
******************************************************************************/
33
// ColumType lists the value types that can be associated to a csv column
pub enum ColumType {
	string = 0
	int    = 1
	f32    = 2
}
38

39
// HeaderItem describes one column of the csv header
pub struct HeaderItem {
pub mut:
	label  string // text of the header cell
	column int    // index of the column in the row
	htype  ColumType = .string // type of the values of the column, string by default
}
45

46
// RandomAccessReader holds the state of a random access csv reader
pub struct RandomAccessReader {
pub mut:
	index i64 // position set by csv_reader, moved past the BOM when one is found

	f              os.File // source file, opened only for a file source
	f_len          i64     // length of the source file
	is_bom_present bool    // true if the source starts with the UTF-8 BOM

	start_index i64      // NOTE: fill_buffer overwrites it with the file offset of the loaded buffer
	end_index   i64 = -1 // map_csv and rows_count scan up to this offset

	end_line      u8  = `\n`
	end_line_len  int = endline_cr_len // size of the endline rune \n = 1, \r\n = 2
	separator     u8  = `,`            // comma is the default separator
	separator_len int = 1              // size of the separator rune
	quote         u8  = `"`            // double quote is the standard quote char
	quote_remove  bool // if true clear the cell from the quotes
	comment       u8 = `#` // every line that starts with the comment char is ignored

	default_cell string = '*' // return this string if out of the csv boundaries
	empty_cell   string = '#' // return this string if the cell is empty
	// ram buffer
	mem_buf_type  u32     // buffer type 0=File,1=RAM
	mem_buf       voidptr // buffer used to load chars from file
	mem_buf_size  i64     // size of the buffer
	mem_buf_start i64 = -1 // start index in the file of the read buffer
	mem_buf_end   i64 = -1 // end index in the file of the read buffer
	// csv map for quick access: one entry per row, each holding the offset of the
	// first byte of the row followed by the offsets of its separators and end of line
	csv_map [][]i64
	// header
	header_row  int = -1 // row index of the header in the csv_map
	header_list []HeaderItem   // list of the header item
	header_map  map[string]int // map from header label to column index
}
80

81
// RandomAccessReaderConfig collects the parameters accepted by csv_reader
@[params]
pub struct RandomAccessReaderConfig {
pub:
	scr_buf      voidptr // pointer to the buffer of data
	scr_buf_len  i64     // if > 0 use the RAM pointed from scr_buf as source of data
	file_path    string  // csv file to read, used when no RAM source is given
	start_index  i64
	end_index    i64    = -1 // -1 means the end of the source
	mem_buf_size int    = 1024 * 64 // default buffer size 64KByte
	separator    u8     = `,`
	comment      u8     = `#` // every line that starts with the comment char is ignored
	default_cell string = '*' // return this string if out of the csv boundaries
	empty_cell   string // return this string if empty cell
	end_line_len int = endline_cr_len // size of the endline rune
	quote        u8  = `"`            // double quote is the standard quote char
	quote_remove bool // if true clear the cell from the quotes
}
98

99
/******************************************************************************
100
*
101
* Init, dispose, fill buffer
102
*
103
******************************************************************************/
104

105
// csv_reader_from_string creates a csv reader that uses the bytes of `in_str` as RAM source.
// The reader keeps a pointer to the string data instead of copying it.
pub fn csv_reader_from_string(in_str string) !&RandomAccessReader {
	return csv_reader(RandomAccessReaderConfig{
		scr_buf:     in_str.str
		scr_buf_len: in_str.len
	})!
}
109

110
// csv_reader creates a random access csv reader.
// The source is the RAM buffer `cfg.scr_buf` when it is set and `cfg.scr_buf_len > 0`,
// otherwise the file `cfg.file_path`.
// It returns an error if no source is given, if the file is missing or cannot be read,
// or if the csv cannot be mapped; in that case the file and the read buffer are released.
pub fn csv_reader(cfg RandomAccessReaderConfig) !&RandomAccessReader {
	mut cr := &RandomAccessReader{}

	cr.start_index = cfg.start_index
	cr.end_index = cfg.end_index

	if cfg.scr_buf != 0 && cfg.scr_buf_len > 0 {
		cr.mem_buf_type = ram_csv // RAM buffer
		cr.mem_buf = cfg.scr_buf
		cr.mem_buf_size = cfg.scr_buf_len
		if cfg.end_index == -1 {
			cr.end_index = cfg.scr_buf_len
		}

		// check if BOM header is in the memory buffer,
		// a buffer shorter than the BOM must not be read past its end
		if cfg.scr_buf_len >= 3 {
			unsafe {
				if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
					&& *(&u8(cr.mem_buf) + 2) == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
				}
			}
		}
	} else if cfg.file_path.len > 0 {
		// check if is a file source
		if !os.exists(cfg.file_path) {
			return error('ERROR: file ${cfg.file_path} not found!')
		}
		cr.mem_buf_type = file_csv // File buffer
		cr.init_file_source(cfg) or {
			cr.dispose_csv_reader()
			return err
		}
	} else {
		return error('ERROR: no csv source, set scr_buf and scr_buf_len or file_path!')
	}

	cr.default_cell = cfg.default_cell
	cr.empty_cell = cfg.empty_cell
	cr.end_line_len = cfg.end_line_len
	cr.separator = cfg.separator
	cr.comment = cfg.comment
	cr.quote_remove = cfg.quote_remove
	cr.quote = cfg.quote

	cr.map_csv() or {
		// do not leave the file open or the buffer allocated behind a failed reader
		cr.dispose_csv_reader()
		return err
	}

	return cr
}

// init_file_source opens the csv file, allocates the read buffer and looks for the BOM
fn (mut cr RandomAccessReader) init_file_source(cfg RandomAccessReaderConfig) ! {
	if cfg.mem_buf_size <= 0 {
		return error('ERROR: mem_buf_size must be greater than 0!')
	}
	// open first: a failed open must not leave an allocated buffer behind
	cr.f = os.open_file(cfg.file_path, 'rb')!

	// allocate the memory
	unsafe {
		cr.mem_buf = malloc(cfg.mem_buf_size)
	}
	cr.mem_buf_size = cfg.mem_buf_size

	cr.f.seek(0, .end)!
	cr.f_len = cr.f.tell()!

	cr.f.seek(cfg.start_index, .start)!
	cr.index = cr.f.tell()!

	if cfg.end_index == -1 {
		cr.end_index = cr.f_len
	}

	// check if BOM header is in the file
	if cr.index == 0 {
		// the BOM is 3 bytes long, probe only what the buffer can hold
		if cr.mem_buf_size >= 3 && cr.f.read_into_ptr(cr.mem_buf, 3)! == 3 {
			unsafe {
				if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
					&& *(&u8(cr.mem_buf) + 2) == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
				}
			}
		}
		cr.f.seek(cfg.start_index, .start)!
	}
}
184

185
// dispose_csv_reader releases the resources used by the csv_reader.
// A RAM source is left untouched; for a file source the file is closed
// and the read buffer is freed.
pub fn (mut cr RandomAccessReader) dispose_csv_reader() {
	if cr.mem_buf_type != file_csv {
		// do nothing, ram buffer is static
		return
	}

	// file close
	if cr.f.is_opened {
		cr.f.close()
	}

	// free the allocated memory
	if cr.mem_buf_size > 0 {
		unsafe {
			free(cr.mem_buf)
		}
		cr.mem_buf = unsafe { nil }
		cr.mem_buf_size = 0
	}

	// invalidate the buffer window: a later get_cell must go through fill_buffer
	// (and fail there) instead of reading the released buffer
	cr.mem_buf_start = -1
	cr.mem_buf_end = -1
}
205

206
// fill_buffer makes the bytes of the source starting at offset `i` available in mem_buf.
// It returns the number of bytes available from `i`, or -1 if the buffer type is unknown.
fn (mut cr RandomAccessReader) fill_buffer(i i64) !i64 {
	// use ram
	if cr.mem_buf_type == ram_csv {
		// nothing to load, the whole source is already in memory:
		// only the window covered by the buffer is updated
		cr.mem_buf_start = i
		cr.mem_buf_end = cr.mem_buf_size
		return cr.mem_buf_end - cr.mem_buf_start
	}
	// use file
	if cr.mem_buf_type == file_csv {
		// start_index is reused as the file offset of the first byte of mem_buf,
		// get_cell relies on it to address the buffer
		cr.start_index = i
		cr.f.seek(cr.start_index, .start)!
		// IMPORTANT: add 64 bit support in vlib!!
		read_bytes_count := cr.f.read_into_ptr(cr.mem_buf, int(cr.mem_buf_size))!
		cr.mem_buf_start = i
		cr.mem_buf_end = i + read_bytes_count
		return i64(read_bytes_count)
	}
	return i64(-1)
}
228

229
/******************************************************************************
230
*
231
* Csv mapper, mapped reader
232
*
233
******************************************************************************/
234
// map_csv creates an index of the whole csv to allow random access to every cell.
// Every row of csv_map holds the offset of the first byte of a csv row, followed by the
// offsets of its separators and of its end of line. Comment rows and empty rows are skipped.
// It returns an error if the source cannot be read or if a quote is still open at an end of line.
pub fn (mut cr RandomAccessReader) map_csv() ! {
	mut count := 0
	mut i := i64(0)
	mut drop_row := false
	mut quote_flag := false // true if we are parsing inside a quote

	// if File return to the start of the file
	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}

	// start from an empty map, mapping the same reader again must not pile up rows
	cr.csv_map.clear()

	unsafe {
		p := &u8(cr.mem_buf)
		cr.csv_map << []i64{}
		cr.csv_map[0] << if cr.is_bom_present { 3 } else { 0 } // skip the BOM data
		for i < cr.end_index {
			read_bytes_count := cr.fill_buffer(i)!
			// no more data (e.g. end_index beyond the end of a RAM source):
			// stop here instead of looping forever on the same offset
			if read_bytes_count <= 0 {
				break
			}
			mut p1 := p
			mut i1 := i64(0)
			for i1 < read_bytes_count {
				if *p1 == cr.quote {
					// manage quote char
					quote_flag = !quote_flag
					p1++
					i1++
				} else if !quote_flag && *p1 == cr.comment
					&& cr.csv_map[cr.csv_map.len - 1].len <= 1 {
					// manage comment line: the comment char is found before any separator
					drop_row = true
					p1++
					i1++
				} else if !quote_flag && *p1 == cr.separator && !drop_row {
					// capture separator
					cr.csv_map[cr.csv_map.len - 1] << (i + i1)

					p1 += cr.separator_len
					i1 += cr.separator_len
				} else if *p1 == cr.end_line {
					// capture end line
					if quote_flag {
						error_col := cr.csv_map[cr.csv_map.len - 1].last() - cr.csv_map[cr.csv_map.len - 1].first()
						return error('ERROR: quote not closed at row ${count} after column ${error_col}!')
					}
					count++

					// close the row on the first byte of its endline
					cr.csv_map[cr.csv_map.len - 1] << ((i + i1) - (cr.end_line_len - 1))
					p1 += cr.end_line_len
					i1 += cr.end_line_len

					if drop_row == true {
						// comment row: recycle it
						cr.csv_map[cr.csv_map.len - 1].clear()
						drop_row = false
					} else {
						// skip empty rows
						if cr.csv_map[cr.csv_map.len - 1].len == 2
							&& cr.csv_map[cr.csv_map.len - 1][0] == cr.csv_map[cr.csv_map.len - 1][1] {
							// recycle the row
							cr.csv_map[cr.csv_map.len - 1].clear()
						} else {
							// it all ok, insert a new row
							cr.csv_map << []i64{cap: cr.csv_map[cr.csv_map.len - 1].len}
						}
					}

					// the next row starts on the byte that follows the end_line char
					cr.csv_map[cr.csv_map.len - 1] << ((i + i1) - (cr.end_line_len - 1))

					p1 -= (cr.end_line_len - 1)
					i1 -= (cr.end_line_len - 1)
				} else {
					p1++
					i1++
				}
			}
			i += read_bytes_count
		}
	}
	// remove last row if it is not a valid one
	if cr.csv_map[cr.csv_map.len - 1].len < 2 {
		cr.csv_map.delete(cr.csv_map.len - 1)
	}

	// if File return to the start of the file
	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}
}
334

335
// get_row gets a row from the csv as a string array.
// A row index outside of the mapped csv yields an empty array.
// It returns an error if one of the cells cannot be read.
pub fn (mut cr RandomAccessReader) get_row(y int) ![]string {
	mut h := []string{}
	// indexing csv_map with a row outside of the map would panic
	if y < 0 || y >= cr.csv_map.len {
		return h
	}
	// a row holds one offset more than its cells, this is valid for a single row csv too
	for x in 0 .. (cr.csv_map[y].len - 1) {
		h << cr.get_cell(x: x, y: y)!
	}
	return h
}
345

346
// GetCellConfig holds the coordinates of a cell
@[params]
pub struct GetCellConfig {
pub:
	x int // column index of the cell
	y int // row index of the cell in the csv map
}
352

353
// get_cell reads a single cell and returns it as a string.
// It returns `default_cell` if the coordinates are outside of the csv and `empty_cell`
// if the cell has no content. When `quote_remove` is set only the text between the first
// and the last quote of the cell is returned; a cell without quotes is returned as it is.
// It returns an error if the source cannot be read or if the cell is larger than the read buffer.
pub fn (mut cr RandomAccessReader) get_cell(cfg GetCellConfig) !string {
	// out of the csv boundaries, negative coordinates included
	if cfg.x < 0 || cfg.y < 0 || cfg.y >= cr.csv_map.len
		|| cfg.x >= (cr.csv_map[cfg.y].len - 1) {
		return cr.default_cell
	}

	mut start := cr.csv_map[cfg.y][cfg.x]
	mut end := cr.csv_map[cfg.y][cfg.x + 1]

	// every cell but the first of the row starts on its leading separator: skip it
	if cfg.x > 0 {
		start++
	}

	mut len := end - start
	if len <= 0 {
		return cr.empty_cell
	}

	// fill the buffer if needed
	if !(start >= cr.mem_buf_start && end < cr.mem_buf_end) {
		cr.fill_buffer(start)!
	}
	// the cell is still not covered by the buffer: it is larger than the buffer itself,
	// fail instead of reading past the end of mem_buf (the quote scan reads the byte at `end` too)
	if end > cr.mem_buf_end || (cr.quote_remove && end >= cr.mem_buf_end) {
		return error('ERROR: cell [${cfg.x},${cfg.y}] does not fit in the read buffer!')
	}

	mut ret_str := ''
	unsafe {
		// execute this section only if we need to remove the quotes
		if cr.quote_remove {
			// look for the opening quote without consuming the cell:
			// a cell that has no quote must be returned untouched
			mut quote_pos := start
			mut tmp_p := &u8(cr.mem_buf) + start - cr.start_index
			for quote_pos < end && *tmp_p != cr.quote {
				quote_pos++
				tmp_p++
			}
			if quote_pos < end {
				// drop everything up to the opening quote, quote included
				start = quote_pos + 1

				// walk back from the end of the cell to the closing quote
				tmp_p = &u8(cr.mem_buf) + end - cr.start_index
				for end > start {
					if *tmp_p == cr.quote {
						break
					}
					tmp_p--
					end--
				}

				len = end - start
				if len <= 0 {
					return cr.empty_cell
				}
			}
		}

		// create the string from the buffer
		mut tmp_mem := malloc_noscan(isize(len + 1))
		mem_start := &u8(cr.mem_buf) + start - cr.start_index
		vmemcpy(tmp_mem, mem_start, isize(len))
		tmp_mem[len] = 0 // 0 for C string compatibility
		ret_str = tos(tmp_mem, int(len))
	}
	return ret_str
}
423

424
// CellValue is the typed value of a cell, as returned by get_cellt
type CellValue = f32 | int | string
425

426
// get_cellt reads a single cell and returns it as a sum type CellValue.
// The cell is converted to the type stored in the header item of its column; without a
// header, or for a column that has no header item, the cell is returned as a string.
// It returns an error if the cell cannot be read.
pub fn (mut cr RandomAccessReader) get_cellt(cfg GetCellConfig) !CellValue {
	// no type information for this column (a negative column must not index header_list)
	if cr.header_row < 0 || cfg.x < 0 || cfg.x >= cr.header_list.len {
		return cr.get_cell(cfg)!
	}

	htype := cr.header_list[cfg.x].htype
	res := cr.get_cell(cfg)!
	if htype == .int {
		return res.int()
	}
	if htype == .f32 {
		return res.f32()
	}
	return res
}
441

442
/******************************************************************************
443
*
444
* Header management
445
*
446
******************************************************************************/
447
// GetHeaderConf holds the parameters of build_header_dict
@[params]
pub struct GetHeaderConf {
pub:
	header_row int // row where to inspect the header
}
452

453
// build_header_dict builds the header list and the label to column map from the row
// `cfg.header_row` (row 0 if no row number is passed).
// It tries to infer the type of every column using the row that follows the header.
// By default all the columns are of the string type.
// Nothing is done if the csv has less than two rows or if the header row is out of the csv.
// It returns an error if a cell of the header or of the following row cannot be read.
pub fn (mut cr RandomAccessReader) build_header_dict(cfg GetHeaderConf) ! {
	if cr.csv_map.len <= 1 || cfg.header_row < 0 || cfg.header_row >= cr.csv_map.len {
		return
	}
	cr.header_row = cfg.header_row

	// drop the result of a previous call, otherwise the header items pile up
	cr.header_list.clear()
	cr.header_map.clear()

	// the type can be inferred only if we have at least one more row
	has_data_row := cfg.header_row + 1 < cr.csv_map.len

	for col in 0 .. (cr.csv_map[cfg.header_row].len - 1) {
		// fill the base struct
		label := cr.get_cell(x: col, y: cfg.header_row)!
		mut h := HeaderItem{
			label:  label
			column: col
			htype:  .string
		}

		if has_data_row {
			x := cr.get_cell(x: col, y: cfg.header_row + 1)!.trim_space().to_lower()
			h.htype = infer_colum_type(x)
		}

		cr.header_list << h
		cr.header_map[label] = col
	}
}

// infer_colum_type makes a raw estimation of the type of the value stored in `x`
fn infer_colum_type(x string) ColumType {
	mut int_c := 0
	mut float_c := 0
	for c in x {
		if c in [`+`, `-`] {
			continue
		}
		if c >= `0` && c <= `9` {
			int_c++
			continue
		}
		if c == `.` {
			float_c++
			continue
		}
		// an exponent is valid only after a digit or a dot
		if c in [`e`, `E`] && (float_c > 0 || int_c > 0) {
			float_c++
			continue
		}
		// any other char makes the value a string
		return .string
	}
	// a value without digits (empty cell, lone sign or dot) is not a number
	if int_c == 0 {
		return .string
	}
	if float_c > 0 {
		return .f32
	}
	return .int
}
514

515
/******************************************************************************
516
*
517
* Utility function
518
*
519
******************************************************************************/
520
// rows_count counts the end_line chars found while scanning the source from offset 0
// up to end_index. Comment rows and empty rows are counted too.
// It returns an error if the source cannot be read.
pub fn (mut cr RandomAccessReader) rows_count() !i64 {
	mut count := i64(0)
	mut i := i64(0)

	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}
	unsafe {
		p := &u8(cr.mem_buf)
		for i < cr.end_index {
			read_bytes_count := cr.fill_buffer(i)!
			// no more data (e.g. end_index beyond the end of a RAM source):
			// stop here instead of looping forever on the same offset
			if read_bytes_count <= 0 {
				break
			}
			mut p1 := p
			mut i1 := i64(0)
			for i1 < read_bytes_count {
				if *p1 == cr.end_line {
					count++
				}
				p1++
				i1++
			}
			i += read_bytes_count
		}
	}
	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}
	return count
}
551

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.