v

csv_reader_sequential.v
298 строк · 7.5 Кб
Перенос по словам
1
/*
2
csv serial reader 1.0 alpha
3

4
Copyright (c) 2023 Dario Deledda. All rights reserved.
5
Use of this source code is governed by an MIT license
6
that can be found in the LICENSE file.
7

8
Known limitations:
9
*/
10
module csv
11

12
import os
13

14
// SequentialReaderConfig holds the options accepted by csv_sequential_reader.
// The RAM buffer is the data source when `scr_buf` is not null and
// `scr_buf_len` is > 0; otherwise the file named by `file_path` is used.
@[params]
pub struct SequentialReaderConfig {
pub:
	scr_buf      voidptr // address of the csv data already loaded in RAM
	scr_buf_len  i64     // byte length of scr_buf; the RAM source is selected only when > 0
	file_path    string  // csv file opened when no RAM buffer is supplied
	start_index  i64     // byte offset where the reading starts
	end_index    i64    = -1 // byte offset where the reading stops; -1 selects the end of the source
	mem_buf_size int    = 1024 * 64 // bytes allocated for the file read buffer, 64KByte by default
	separator    u8     = `,` // byte that divides two cells
	comment      u8     = `#` // a row whose first byte is this char is ignored
	default_cell string = '*' // string for cells out of the csv boundaries
	empty_cell   string // string returned in place of an empty cell
	end_line_len int = endline_cr_len // size in bytes of the end of line sequence
	quote        u8  = `"` // byte that opens and closes a quoted cell, double quote is the standard
}
30

31
// SequentialReader keeps the state of a csv reader that parses its source
// one row at a time, either from a file or from a RAM buffer.
pub struct SequentialReader {
pub mut:
	index i64 // offset recorded while the source is opened, moved 3 bytes forward when a BOM is found

	f              os.File // handle of the csv file, used by the file source
	f_len          i64     // length in bytes of the csv file
	is_bom_present bool    // true when the UTF-8 BOM (EF BB BF) was found at the beginning of the data

	start_index i64      // offset of the next byte to parse
	end_index   i64 = -1 // offset where the parsing stops

	end_line      u8  = `\n` // byte that closes a row
	end_line_len  int = endline_cr_len // size of the end of line sequence: \n = 1, \r\n = 2
	separator     u8  = `,` // comma is the default separator
	separator_len int = 1 // size in bytes of the separator
	quote         u8  = `"` // double quote is the standard quote char

	comment u8 = `#` // every row that starts with the comment char is ignored

	default_cell string = '*' // string for cells out of the csv boundaries
	empty_cell   string = '#' // string returned in place of an empty cell
	// ram buffer
	mem_buf_type  u32     // kind of source, compared with file_csv and ram_csv (originally noted as 0=File,1=RAM)
	mem_buf       voidptr // file read buffer, or the caller data for the RAM source
	mem_buf_size  i64     // size in bytes of mem_buf
	mem_buf_start i64 = -1 // offset in the source of the first byte held by mem_buf
	mem_buf_end   i64 = -1 // offset in the source just after the last byte held by mem_buf

	ch_buf []u8 = []u8{cap: 1024} // bytes collected for the cell that is being parsed
	// error management
	row_count i64 // end of line bytes met so far, reported by the error messages
	col_count i64 // bytes consumed in the current row, reported by the error messages
}
64

65
// csv_sequential_reader creates a sequential csv reader.
// The source is the RAM buffer `cfg.scr_buf` when it is not null and
// `cfg.scr_buf_len` is > 0, otherwise the file `cfg.file_path`.
// When neither is given, the returned reader has no source attached.
// A UTF-8 BOM is looked for only when the reading starts at offset 0; when
// it is found, `is_bom_present` is set and the first 3 bytes are skipped.
// An error is returned when the file does not exist or when it can not be
// opened, measured or read; in that case the file handle and the read
// buffer acquired so far are released before returning.
pub fn csv_sequential_reader(cfg SequentialReaderConfig) !&SequentialReader {
	mut cr := &SequentialReader{}

	cr.start_index = cfg.start_index
	cr.end_index = cfg.end_index

	if cfg.scr_buf != 0 && cfg.scr_buf_len > 0 {
		// reading from a RAM buffer, the memory stays owned by the caller
		cr.mem_buf_type = ram_csv
		cr.mem_buf = cfg.scr_buf
		cr.mem_buf_size = cfg.scr_buf_len
		if cfg.end_index == -1 {
			cr.end_index = cfg.scr_buf_len
		}

		// The BOM can only be at the very beginning of the data: probe it only
		// when the reading starts there and when the buffer holds at least the
		// 3 bytes that are going to be read.
		if cfg.start_index == 0 && cfg.scr_buf_len >= 3 {
			unsafe {
				if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
					&& *(&u8(cr.mem_buf) + 2) == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
					cr.start_index += 3 // skip the BOM
				}
			}
		}
		// the whole source is already available in memory
		cr.mem_buf_start = 0
		cr.mem_buf_end = cr.mem_buf_size
	} else if cfg.file_path.len > 0 {
		// reading from a file
		if !os.exists(cfg.file_path) {
			return error('ERROR: file ${cfg.file_path} not found!')
		}
		cr.open_file_source(cfg) or {
			// do not leak the file handle and the read buffer on failure
			cr.dispose_csv_reader()
			return err
		}
	}

	cr.default_cell = cfg.default_cell
	cr.empty_cell = cfg.empty_cell
	cr.end_line_len = cfg.end_line_len
	cr.separator = cfg.separator
	cr.comment = cfg.comment
	cr.quote = cfg.quote

	return cr
}

// open_file_source opens `cfg.file_path`, allocates the read buffer,
// measures the file and looks for the BOM when the reading starts at offset 0.
fn (mut cr SequentialReader) open_file_source(cfg SequentialReaderConfig) ! {
	cr.mem_buf_type = file_csv
	// open before allocating, a failed open leaves nothing to release
	cr.f = os.open_file(cfg.file_path, 'rb')!
	unsafe {
		cr.mem_buf = malloc(cfg.mem_buf_size)
		cr.mem_buf_size = cfg.mem_buf_size
	}

	// the file length is the position reached by seeking to the end
	cr.f.seek(0, .end)!
	cr.f_len = cr.f.tell()!

	cr.f.seek(cfg.start_index, .start)!
	cr.index = cr.f.tell()!

	if cfg.end_index == -1 {
		cr.end_index = cr.f_len
	}

	// check if BOM header is in the file
	if cr.index == 0 {
		if cr.f.read_into_ptr(cr.mem_buf, 4)! == 4 {
			unsafe {
				if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
					&& *(&u8(cr.mem_buf) + 2) == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
					cr.start_index += 3 // skip the BOM
				}
			}
		}
		// move the file position back after the probe
		cr.f.seek(cfg.start_index, .start)!
	}
}
141

142
// dispose_csv_reader releases what the reader acquired: for a file source
// it closes the file when it is open and frees the read buffer.
// Nothing is done for a RAM source.
pub fn (mut cr SequentialReader) dispose_csv_reader() {
	// only the file source owns resources, the RAM buffer is not allocated by the reader
	if cr.mem_buf_type != file_csv {
		return
	}

	if cr.f.is_opened {
		cr.f.close()
	}

	// nothing allocated, or already released
	if cr.mem_buf_size <= 0 {
		return
	}
	unsafe {
		free(cr.mem_buf)
		cr.mem_buf = nil
	}
	cr.mem_buf_size = 0
}
162

163
// has_data returns how many bytes are left between the current read
// position (`start_index`) and the end of the data (`end_index`).
pub fn (mut cr SequentialReader) has_data() i64 {
	remaining := cr.end_index - cr.start_index
	return remaining
}
167

168
// fill_buffer loads into `mem_buf` the bytes of the file that start at
// `index` and records in `mem_buf_start`/`mem_buf_end` the range of the
// file that the buffer now covers. It does nothing for a RAM source.
// Errors of the seek and of the read are propagated.
fn (mut cr SequentialReader) fill_buffer(index i64) ! {
	if cr.mem_buf_type == ram_csv {
		// for now nothing to load for the ram buffer
		return
	}
	cr.f.seek(index, .start)!
	// IMPORTANT: add 64 bit support in vlib!!
	n_read := cr.f.read_into_ptr(cr.mem_buf, int(cr.mem_buf_size))!
	cr.mem_buf_start = index
	cr.mem_buf_end = index + n_read
}
179

180
// SequentialReadingState lists the states of the parser used by get_next_row.
enum SequentialReadingState as u16 {
	comment     // inside a comment row, left when the end of line is met
	quote       // inside a quoted cell, after the opening quote
	after_quote // after the closing quote, waiting for a separator or an end of line
	cell        // inside a not quoted cell, it is the starting state of every row
	newline     // not used by get_next_row
}
187

188
// get_next_row gets the next row of the csv source as a string array.
// The parsing starts at `start_index` and stops after the first not empty
// row, or at `end_index`; `start_index` is then moved to the reached
// position. Empty rows and rows that start with the comment char are
// skipped. An empty cell is returned as `empty_cell`.
// A last row that is not closed by an end of line is returned as well.
// When the end of line is 1 byte long the read position is left on the end
// of line byte itself, the following call skips it as an empty row.
// An error is returned when a quoted cell is not closed before the end of
// its line, or when the read buffer can not be filled.
pub fn (mut cr SequentialReader) get_next_row() ![]string {
	mut row_res := []string{}
	// clear the cell buffer
	cr.ch_buf.clear()
	mut i := cr.start_index
	mut state := SequentialReadingState.cell
	// true when the row has been closed by an end of line
	mut row_closed := false

	p := &u8(cr.mem_buf)
	for i < cr.end_index {
		// load the part of the source that holds the byte at index i
		if i < cr.mem_buf_start || i >= cr.mem_buf_end {
			cr.fill_buffer(i)!
		}
		ch := unsafe { *(p + i - cr.mem_buf_start) }

		if state == .cell {
			if ch == cr.separator {
				row_res << cr.cell_from_ch_buf()
				cr.ch_buf.clear()
			} else if cr.ch_buf.len == 0 && ch == cr.comment && row_res.len == 0 {
				// the comment char is the first byte of the row
				state = .comment
			} else if ch == cr.quote {
				state = .quote
				cr.ch_buf.clear()
				cr.col_count++
				i++
				continue
			} else if ch == cr.end_line {
				cr.row_count++
				cr.col_count = 0

				// skip empty rows
				if !(row_res.len == 0 && cr.ch_buf.len < 1) {
					row_res << cr.cell_from_ch_buf()
					i += cr.end_line_len - 1
					row_closed = true
					break
				}
			} else if ch == `\r` && cr.end_line_len == 2 {
				// skip CR
			} else { // normal char inside a cell
				cr.ch_buf << ch
			}
		}

		if state == .comment {
			// every byte of a comment row is dropped, the end of line closes it
			if ch == cr.end_line {
				state = .cell
			}
		}

		if state == .quote {
			if ch == cr.quote {
				// the closing quote completes the cell
				row_res << cr.cell_from_ch_buf()
				cr.ch_buf.clear()

				state = .after_quote
				cr.col_count++
				i++
				continue
			} else if ch == cr.end_line {
				return error('ERROR: quote not closed at row ${cr.row_count} after column ${cr.col_count}!')
			} else { // normal char inside a quote inside a cell
				cr.ch_buf << ch
			}
		}

		if state == .after_quote {
			// the bytes between the closing quote and the separator are dropped
			if ch == cr.separator {
				state = .cell
			} else if ch == cr.end_line {
				cr.row_count++
				cr.col_count = 0
				cr.ch_buf.clear()
				i += cr.end_line_len - 1
				row_closed = true
				break
			}
		}
		cr.col_count++
		i++
	}

	// The data ended without an end of line: the cell that was being read
	// is still in the cell buffer and must be added to the row.
	if !row_closed && state == .cell && !(row_res.len == 0 && cr.ch_buf.len < 1) {
		row_res << cr.cell_from_ch_buf()
	}

	cr.start_index = i
	return row_res
}

// cell_from_ch_buf returns the content of the cell buffer as a new string,
// or `empty_cell` when the buffer holds no bytes. The buffer is not changed.
fn (cr &SequentialReader) cell_from_ch_buf() string {
	if cr.ch_buf.len == 0 {
		return cr.empty_cell
	}
	// bytestr copies the bytes, the string does not share the buffer memory
	return cr.ch_buf.bytestr()
}
299
v

Использование cookies