// v
// Mirror of https://github.com/vlang/v
/*
csv random access reader 1.0 alpha

Copyright (c) 2023 Dario Deledda. All rights reserved.
Use of this source code is governed by an MIT license
that can be found in the LICENSE file.

Known limitations:
- no stream reading
*/
11module csv
12
13import os
14
/******************************************************************************
*
* Consts
*
******************************************************************************/
// Length in bytes of the supported line terminators
pub const endline_cr_len = 1 // one byte terminator: \n
pub const endline_crlf_len = 2 // two bytes terminator: \r\n

// Kind of source used by the read buffer
pub const file_csv = 0 // the data is read from a file
pub const ram_csv = 1 // the data is read from a RAM buffer
27
/******************************************************************************
*
* Structs
*
******************************************************************************/
// ColumType is the type of the data associated to a column of the csv
pub enum ColumType {
	string = 0
	int    = 1
	f32    = 2
}
38
// HeaderItem describes one column of the header
pub struct HeaderItem {
pub mut:
	label  string // text of the header cell
	column int    // index of the column
	htype  ColumType = .string // type of the column, string by default
}
45
// RandomAccessReader holds the state of a csv reader that allows
// random access to the cells through the index stored in `csv_map`
pub struct RandomAccessReader {
pub mut:
	index i64 // read offset set by csv_reader, moved after the BOM when it is present

	f              os.File // source file, used when mem_buf_type is file_csv
	f_len          i64     // length in bytes of the source file
	is_bom_present bool    // true if the source starts with the 3 bytes BOM (EF BB BF)

	start_index i64 // start offset from the config, fill_buffer overwrites it for a file source
	end_index   i64 = -1 // offset where the mapping stops

	end_line      u8  = `\n`
	end_line_len  int = endline_cr_len // size of the endline rune \n = 1, \r\n = 2
	separator     u8  = `,` // comma is the default separator
	separator_len int = 1 // size of the separator rune
	quote         u8  = `"` // double quote is the standard quote char
	quote_remove  bool // if true clear the cell from the quotes
	comment       u8 = `#` // rows marked with the comment char are ignored

	default_cell string = '*' // return this string if out of the csv boundaries
	empty_cell   string = '#' // return this string if the cell is empty
	// ram buffer
	mem_buf_type  u32     // buffer type 0=File,1=RAM
	mem_buf       voidptr // buffer used to load chars from file
	mem_buf_size  i64     // size of the buffer
	mem_buf_start i64 = -1 // start index in the file of the read buffer
	mem_buf_end   i64 = -1 // end index in the file of the read buffer
	// csv map for quick access
	csv_map [][]i64
	// header
	header_row  int = -1 // row index of the header in the csv_map
	header_list []HeaderItem   // list of the header items
	header_map  map[string]int // map from header label to column index
}
80
// RandomAccessReaderConfig is the set of options accepted by csv_reader
@[params]
pub struct RandomAccessReaderConfig {
pub:
	scr_buf      voidptr // pointer to the buffer of data
	scr_buf_len  i64     // if > 0 use the RAM pointed from scr_buf as source of data
	file_path    string  // path of the csv file, used when no RAM buffer is given
	start_index  i64
	end_index    i64    = -1
	mem_buf_size int    = 1024 * 64 // default buffer size 64KByte
	separator    u8     = `,`
	comment      u8     = `#` // rows marked with the comment char are ignored
	default_cell string = '*' // return this string if out of the csv boundaries
	empty_cell   string // return this string if the cell is empty
	end_line_len int = endline_cr_len // size of the endline rune
	quote        u8  = `"` // double quote is the standard quote char
	quote_remove bool // if true clear the cell from the quotes
}
98
/******************************************************************************
*
* Init, dispose, fill buffer
*
******************************************************************************/
104
// csv_reader_from_string create a csv reader that uses the bytes of `in_str` as source.
// The pointer to the string data is passed as RAM buffer to csv_reader.
pub fn csv_reader_from_string(in_str string) !&RandomAccessReader {
	cfg := RandomAccessReaderConfig{
		scr_buf:     in_str.str
		scr_buf_len: in_str.len
	}
	return csv_reader(cfg)!
}
109
// csv_reader create a random access csv reader.
// The source is the RAM buffer `cfg.scr_buf` when it is set and `cfg.scr_buf_len > 0`,
// otherwise it is the file `cfg.file_path`.
// It returns an error if no source is given, if the file is not found or cannot be
// read, if `cfg.mem_buf_size` is not positive for a file source or if the csv cannot
// be mapped. In case of error the resources already taken are released.
pub fn csv_reader(cfg RandomAccessReaderConfig) !&RandomAccessReader {
	mut cr := &RandomAccessReader{}

	cr.start_index = cfg.start_index
	cr.end_index = cfg.end_index

	if cfg.scr_buf != 0 && cfg.scr_buf_len > 0 {
		cr.mem_buf_type = ram_csv // RAM buffer
		cr.mem_buf = cfg.scr_buf
		cr.mem_buf_size = cfg.scr_buf_len
		if cfg.end_index == -1 {
			cr.end_index = cfg.scr_buf_len
		}

		// check if BOM header is in the memory buffer,
		// the BOM is 3 bytes long: never read past a shorter buffer
		if cfg.scr_buf_len >= 3 {
			unsafe {
				if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
					&& *(&u8(cr.mem_buf) + 2) == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
				}
			}
		}
	}
	// check if is a file source
	else if cfg.file_path.len > 0 {
		if !os.exists(cfg.file_path) {
			return error('ERROR: file ${cfg.file_path} not found!')
		}
		// an empty read buffer can never be filled
		if cfg.mem_buf_size <= 0 {
			return error('ERROR: mem_buf_size must be greater than 0!')
		}
		cr.open_file_source(cfg) or {
			// do not leave the file open or the buffer allocated
			cr.dispose_csv_reader()
			return err
		}
	} else {
		return error('ERROR: no csv source, set scr_buf and scr_buf_len or file_path!')
	}

	cr.default_cell = cfg.default_cell
	cr.empty_cell = cfg.empty_cell
	cr.end_line_len = cfg.end_line_len
	cr.separator = cfg.separator
	cr.comment = cfg.comment
	cr.quote_remove = cfg.quote_remove
	cr.quote = cfg.quote

	cr.map_csv() or {
		// release the file and the read buffer, a RAM source has nothing to release
		cr.dispose_csv_reader()
		return err
	}

	return cr
}

// open_file_source open the file of `cfg`, allocate the read buffer and look for the BOM
fn (mut cr RandomAccessReader) open_file_source(cfg RandomAccessReaderConfig) ! {
	cr.mem_buf_type = file_csv // File buffer
	// open the file first: if it fails there is nothing to release
	cr.f = os.open_file(cfg.file_path, 'rb')!
	// allocate the memory
	unsafe {
		cr.mem_buf = malloc(cfg.mem_buf_size)
		cr.mem_buf_size = cfg.mem_buf_size
	}

	cr.f.seek(0, .end)!
	cr.f_len = cr.f.tell()!

	cr.f.seek(cfg.start_index, .start)!
	cr.index = cr.f.tell()!

	if cfg.end_index == -1 {
		cr.end_index = cr.f_len
	}

	// check if BOM header is in the file, only the 3 bytes of the BOM are read
	// and only if both the file and the read buffer can hold them
	if cr.index == 0 && cr.f_len >= 3 && cr.mem_buf_size >= 3 {
		if cr.f.read_into_ptr(cr.mem_buf, 3)! == 3 {
			unsafe {
				if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
					&& *(&u8(cr.mem_buf) + 2) == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
				}
			}
		}
		cr.f.seek(cfg.start_index, .start)!
	}
}
184
// dispose_csv_reader release the resources used by the csv_reader:
// for a file source it closes the file and frees the read buffer,
// for any other source it does nothing
pub fn (mut cr RandomAccessReader) dispose_csv_reader() {
	// the RAM buffer is not allocated by the reader, nothing to release
	if cr.mem_buf_type != file_csv {
		return
	}

	// file close
	if cr.f.is_opened {
		cr.f.close()
	}

	// no allocated memory to free
	if cr.mem_buf_size <= 0 {
		return
	}
	unsafe {
		free(cr.mem_buf)
		cr.mem_buf = nil
	}
	cr.mem_buf_size = 0
}
205
// fill_buffer make the data that start at the offset `i` available to the reader.
// File source: seek to `i`, load up to `mem_buf_size` bytes in the buffer and return
// the number of bytes read. RAM source: only the buffer limits are updated and the
// number of bytes from `i` to the end of the buffer is returned.
// It returns -1 if the buffer type is unknown.
fn (mut cr RandomAccessReader) fill_buffer(i i64) !i64 {
	// use file
	if cr.mem_buf_type == file_csv {
		// start_index tracks the file offset of the first byte of the buffer
		cr.start_index = i
		cr.f.seek(i, .start)!
		// IMPORTANT: add 64 bit support in vlib!!
		chunk_len := cr.f.read_into_ptr(cr.mem_buf, int(cr.mem_buf_size))!
		cr.mem_buf_start = i
		cr.mem_buf_end = i + chunk_len
		return i64(chunk_len)
	}
	// use ram
	if cr.mem_buf_type == ram_csv {
		// nothing to load, ram buffer are static for now
		cr.mem_buf_start = i
		cr.mem_buf_end = cr.mem_buf_size
		return i64(cr.mem_buf_end - cr.mem_buf_start)
	}
	return i64(-1)
}
228
/******************************************************************************
*
* Csv mapper, mapped reader
*
******************************************************************************/
// map_csv create an index of the whole csv file to allow random access to every cell in the file.
// Every row of `csv_map` holds the offset of the row start, followed by the offset of
// each separator found outside the quotes and by the offset of the row end.
// Rows with the comment char found outside the quotes before the first separator
// and empty rows are not stored.
// It returns an error if a quote is still open at the end of a line or if the source
// cannot be read.
// NOTE(review): the end of a row is stored only when an end of line is found,
// the last cell of a source without a final end of line looks not mapped - confirm.
pub fn (mut cr RandomAccessReader) map_csv() ! {
	mut count := 0
	mut i := i64(0)
	mut capture_flag := true
	mut drop_row := false
	mut quote_flag := false // true if we are parsing inside a quote

	// if File return to the start of the file
	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}

	// map_csv is public and can be called again: always start from an empty map
	cr.csv_map = [][]i64{}

	unsafe {
		p := &u8(cr.mem_buf)
		cr.csv_map << []i64{}
		cr.csv_map[0] << if cr.is_bom_present { 3 } else { 0 } // skip the BOM data
		for i < cr.end_index {
			read_bytes_count := cr.fill_buffer(i)!
			// no progress is possible (nothing read or unknown buffer type):
			// stop here instead of looping forever
			if read_bytes_count <= 0 {
				break
			}
			// println("${i:-12d} of ${cr.f_len:-12d} readed: ${read_bytes_count}")
			mut p1 := p
			mut i1 := i64(0)
			for i1 < read_bytes_count {
				// println("loop char: ${*&u8(p1):c}")
				// manage quote char
				if *p1 == cr.quote {
					quote_flag = !quote_flag
					p1++
					i1++
				}
				else if // manage comment line
				 !quote_flag && *p1 == cr.comment && cr.csv_map[cr.csv_map.len - 1].len <= 1 {
					drop_row = true
					p1++
					i1++
					// println("drop_row: ${cr.csv_map.len - 1}")
				}
				else if // capture separator
				 !quote_flag && capture_flag && *p1 == cr.separator && !drop_row {
					cr.csv_map[cr.csv_map.len - 1] << (i + i1)

					p1 += cr.separator_len
					i1 += cr.separator_len
				}
				else if // capture end line
				 *p1 == cr.end_line {
					if quote_flag {
						error_col := cr.csv_map[cr.csv_map.len - 1].last() - cr.csv_map[cr.csv_map.len - 1].first()
						return error('ERROR: quote not closed at row ${count} after column ${error_col}!')
					}
					count++

					// end of the row: with a two bytes end of line it is the offset of the first byte
					cr.csv_map[cr.csv_map.len - 1] << (i + i1) - (cr.end_line_len - 1)
					p1 += cr.end_line_len
					i1 += cr.end_line_len

					if drop_row == true {
						cr.csv_map[cr.csv_map.len - 1].clear()
						drop_row = false
					} else {
						// skip empty rows
						if cr.csv_map[cr.csv_map.len - 1].len == 2
							&& cr.csv_map[cr.csv_map.len - 1][0] == cr.csv_map[cr.csv_map.len - 1][1] {
							// recycle the row
							cr.csv_map[cr.csv_map.len - 1].clear()
						} else {
							// it all ok, insert a new row
							cr.csv_map << []i64{cap: cr.csv_map[cr.csv_map.len - 1].len}
						}
					}

					// start of the next row
					cr.csv_map[cr.csv_map.len - 1] << (i + i1) - (cr.end_line_len - 1)

					// the scan restarts from the byte after the end line char
					p1 -= (cr.end_line_len - 1)
					i1 -= (cr.end_line_len - 1)
				} else {
					p1++
					i1++
				}
			}
			i += read_bytes_count
		}
	}
	// remove last row if it is not a valid one
	if cr.csv_map[cr.csv_map.len - 1].len < 2 {
		cr.csv_map.delete(cr.csv_map.len - 1)
	}

	// if File return to the start of the file
	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}

	// println("map_csv Done! ${count}")
}
334
// get_row get a row from the CSV file as a string array.
// `y` is the index of the row in the csv map, an empty array is returned
// when `y` is out of the csv boundaries.
pub fn (mut cr RandomAccessReader) get_row(y int) ![]string {
	mut h := []string{}
	// out of the boundaries: nothing to read, do not index the map
	if y < 0 || y >= cr.csv_map.len {
		return h
	}
	// a mapped row holds one offset more than its cells
	for x in 0 .. (cr.csv_map[y].len - 1) {
		h << cr.get_cell(x: x, y: y)!
	}
	return h
}
345
// GetCellConfig is the position of a cell in the csv
@[params]
pub struct GetCellConfig {
pub:
	x int // index of the column
	y int // index of the row in the csv map
}
352
// get_cell read a single cell and return a string.
// It returns `default_cell` if the position is out of the csv boundaries and
// `empty_cell` if the cell is empty.
// If `quote_remove` is true and the cell contains the quote char, only the text between
// the first and the last quote is returned; a cell without quotes is returned as it is.
// It returns an error if the source cannot be read or if the cell does not fit
// in the read buffer.
pub fn (mut cr RandomAccessReader) get_cell(cfg GetCellConfig) !string {
	// out of the csv boundaries, negative indexes included
	if cfg.x < 0 || cfg.y < 0 || cfg.y >= cr.csv_map.len
		|| cfg.x >= (cr.csv_map[cfg.y].len - 1) {
		return cr.default_cell
	}

	mut start := cr.csv_map[cfg.y][cfg.x]
	mut end := cr.csv_map[cfg.y][cfg.x + 1]

	// after the first cell the start offset is the one of the separator: skip it
	if cfg.x > 0 {
		start++
	}

	mut len := end - start
	if len <= 0 {
		return cr.empty_cell
	}

	// fill the buffer if needed
	if !(start >= cr.mem_buf_start && end < cr.mem_buf_end) {
		cr.fill_buffer(start)!
	}

	// the quote filtering reads also the byte at `end`
	needed_end := if cr.quote_remove { end + 1 } else { end }
	if needed_end > cr.mem_buf_end {
		return error('ERROR: cell [${cfg.x},${cfg.y}] does not fit in the read buffer!')
	}

	// offset in the source of the first byte of mem_buf: the offsets of the map
	// start from 0, the RAM buffer holds the whole source, the file buffer
	// holds the chunk loaded by fill_buffer
	buf_offset := if cr.mem_buf_type == ram_csv { i64(0) } else { cr.mem_buf_start }

	unsafe {
		// execute this section only if we need to remove the quotes
		if cr.quote_remove {
			// look for the first quote of the cell
			mut tmp_p := &u8(cr.mem_buf) + start - buf_offset
			mut first_quote := start
			for first_quote < end && *tmp_p != cr.quote {
				first_quote++
				tmp_p++
			}
			// a cell without quotes has nothing to remove
			if first_quote < end {
				// remove front quote and what is before it
				start = first_quote + 1
				// remove back quote and what is after it
				tmp_p = &u8(cr.mem_buf) + end - buf_offset
				for end > start && *tmp_p != cr.quote {
					tmp_p--
					end--
				}

				len = end - start
				if len <= 0 {
					return cr.empty_cell
				}
			}
		}

		// create the string from the buffer
		mut tmp_mem := malloc_noscan(isize(len + 1))
		mem_start := &u8(cr.mem_buf) + start - buf_offset
		vmemcpy(tmp_mem, mem_start, isize(len))
		tmp_mem[len] = 0 // 0 for C string compatibility
		ret_str := tos(tmp_mem, int(len))
		return ret_str
	}
	return cr.default_cell
}
423
// CellValue is the content of a cell converted to the type of its column
type CellValue = f32 | int | string

// get_cellt read a single cell and return a sum type CellValue.
// The text of the cell is converted to the type stored in the header for the
// column `cfg.x`, without a header for that column the text is returned as it is.
pub fn (mut cr RandomAccessReader) get_cellt(cfg GetCellConfig) !CellValue {
	has_header := cr.header_row >= 0 && cfg.x < cr.header_list.len
	if !has_header {
		return cr.get_cell(cfg)!
	}

	col_type := cr.header_list[cfg.x].htype
	text := cr.get_cell(cfg)!
	match col_type {
		.int {
			return text.int()
		}
		.f32 {
			return text.f32()
		}
		else {}
	}
	return text
}
441
/******************************************************************************
*
* Header management
*
******************************************************************************/
// GetHeaderConf is the set of options accepted by build_header_dict
@[params]
pub struct GetHeaderConf {
pub:
	header_row int // index of the row that holds the header
}
452
// build_header_dict infer the header, it uses the first available row if no row number is passed.
// It tries to infer the type of every column using the first available row after the header.
// By default all the columns are of the string type.
// Nothing is done if the csv has less than two rows or if `cfg.header_row` is out of
// the csv boundaries. A new call replaces the header built by the previous one.
pub fn (mut cr RandomAccessReader) build_header_dict(cfg GetHeaderConf) ! {
	if cr.csv_map.len > 1 && cfg.header_row >= 0 && cfg.header_row < cr.csv_map.len {
		cr.header_row = cfg.header_row
		// drop the header of a previous call, otherwise the new items are
		// appended after the old ones and the column indexes do not match anymore
		cr.header_list = []HeaderItem{}
		cr.header_map = map[string]int{}

		for col in 0 .. (cr.csv_map[cfg.header_row].len - 1) {
			// fill the base struct
			label := cr.get_cell(x: col, y: cfg.header_row)!
			mut h := HeaderItem{
				label:  label
				column: col
				htype:  .string
			}

			// try to infer the type if we have at least one more row
			if cfg.header_row + 1 < cr.csv_map.len {
				x := cr.get_cell(x: col, y: cfg.header_row + 1)!.trim_space().to_lower()
				mut int_c := int(0) // digits
				mut dot_c := int(0) // decimal points
				mut exp_c := int(0) // exponent marks
				mut alpha_c := int(0) // chars that cannot be part of a number
				mut htype := ColumType.string
				// raw estimation of the type
				for pos, c in x {
					// a sign is valid only as first char or just after the exponent mark
					if c in [`+`, `-`] && (pos == 0 || x[pos - 1] == `e`) {
						continue
					}
					if c >= `0` && c <= `9` {
						int_c++
						continue
					}
					if c == `.` {
						dot_c++
						continue
					}
					// x is lower case, the exponent mark needs something before it
					if c == `e` && (dot_c > 0 || exp_c > 0 || int_c > 0) {
						exp_c++
						continue
					}
					alpha_c++
					break
				}

				// a number needs at least one digit and at most one decimal point and
				// one exponent mark, every other cell (the empty one too) is a string
				if alpha_c == 0 && int_c > 0 && dot_c <= 1 && exp_c <= 1 {
					if dot_c > 0 || exp_c > 0 {
						htype = .f32
					} else {
						htype = .int
					}
				}
				h.htype = htype
			}

			cr.header_list << h
			cr.header_map[label] = col
		}
	}
}
514
/******************************************************************************
*
* Utility function
*
******************************************************************************/
// rows_count count the end line chars found in the source.
// The source is scanned in chunks from the offset 0 until `end_index` is reached,
// comment rows and empty rows are counted too.
// It returns an error if the source cannot be read.
pub fn (mut cr RandomAccessReader) rows_count() !i64 {
	mut count := i64(0)
	mut i := i64(0)

	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}
	unsafe {
		p := &u8(cr.mem_buf)
		for i < cr.end_index {
			read_bytes_count := cr.fill_buffer(i)!
			// no progress is possible (nothing read or unknown buffer type):
			// stop here instead of looping forever
			if read_bytes_count <= 0 {
				break
			}
			// println("${i:-12d} of ${cr.f_len:-12d} readed: ${read_bytes_count}")
			mut p1 := p
			mut i1 := i64(0)
			for i1 < read_bytes_count {
				if *p1 == cr.end_line {
					count++
				}
				p1++
				i1++
			}
			i += read_bytes_count
		}
	}
	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}
	// println("rows_count Done!")
	return count
}
551