// v - mirror of https://github.com/vlang/v
/*
csv serial reader 1.0 alpha

Copyright (c) 2023 Dario Deledda. All rights reserved.
Use of this source code is governed by an MIT license
that can be found in the LICENSE file.

Known limitations:
*/
10module csv
11
12import os
13
// SequentialReaderConfig collects the options accepted by csv_sequential_reader.
// The data source is either a RAM buffer (scr_buf + scr_buf_len) or a file
// (file_path); the RAM buffer wins when both are given.
@[params]
pub struct SequentialReaderConfig {
pub:
	// pointer to the buffer of data
	scr_buf voidptr
	// if > 0 use the RAM pointed by scr_buf as source of data
	scr_buf_len i64
	// path of the csv file, used when no RAM buffer is supplied
	file_path string
	// byte offset in the source where reading starts
	start_index i64
	// byte offset where reading stops; -1 means "up to the end of the source"
	end_index i64 = -1
	// default buffer size 64KByte
	mem_buf_size int = 1024 * 64
	// cell separator
	separator u8 = `,`
	// every line that starts with the comment char is ignored
	comment u8 = `#`
	// return this string if out of the csv boundaries
	default_cell string = '*'
	// return this string if empty cell
	empty_cell string
	// size of the endline rune
	end_line_len int = endline_cr_len
	// double quote is the standard quote char
	quote u8 = `"`
}
30
// SequentialReader holds the state of a forward-only csv reader created by
// csv_sequential_reader: the data source, the parsing options and the
// bookkeeping of the current read position.
pub struct SequentialReader {
pub mut:
	// position initialised by csv_sequential_reader
	index i64

	// file source
	f              os.File
	f_len          i64
	is_bom_present bool

	// byte range still to be parsed; get_next_row advances start_index
	start_index i64
	end_index   i64 = -1

	end_line u8 = `\n`
	// size of the endline rune \n = 1, \r\n = 2
	end_line_len int = endline_cr_len
	// comma is the default separator
	separator u8 = `,`
	// size of the separator rune
	separator_len int = 1
	// double quote is the standard quote char
	quote u8 = `"`

	// every line that starts with the comment char is ignored
	comment u8 = `#`

	// return this string if out of the csv boundaries
	default_cell string = '*'
	// return this string if empty cell
	empty_cell string = '#'
	// ram buffer
	// buffer type 0=File,1=RAM
	mem_buf_type u32
	// buffer used to load chars from file
	mem_buf voidptr
	// size of the buffer
	mem_buf_size i64
	// start index in the file of the read buffer
	mem_buf_start i64 = -1
	// end index in the file of the read buffer
	mem_buf_end i64 = -1

	// bytes of the cell currently being parsed
	ch_buf []u8 = []u8{cap: 1024}
	// error management
	row_count i64
	col_count i64
}
64
// open_file_source opens cfg.file_path, records its length, positions the
// reader at cfg.start_index and skips a UTF-8 BOM when reading starts at
// offset 0. It is a private helper of csv_sequential_reader and allocates nothing.
fn (mut cr SequentialReader) open_file_source(cfg SequentialReaderConfig) ! {
	cr.f = os.open_file(cfg.file_path, 'rb')!

	cr.f.seek(0, .end)!
	cr.f_len = cr.f.tell()!

	cr.f.seek(cfg.start_index, .start)!
	cr.index = cr.f.tell()!

	if cfg.end_index == -1 {
		cr.end_index = cr.f_len
	}

	// check if BOM header is in the file
	if cr.index == 0 {
		// probe into a local array so the check never depends on mem_buf_size
		mut head := [4]u8{}
		if cr.f.read_into_ptr(unsafe { &head[0] }, 4)! == 4 {
			if head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF {
				cr.is_bom_present = true
				cr.index += 3 // skip the BOM
				cr.start_index += 3 // skip the BOM
			}
		}
		cr.f.seek(cfg.start_index, .start)!
	}
}

// csv_sequential_reader creates a sequential csv reader.
//
// The source is the RAM buffer `scr_buf` when `scr_buf` is not nil and
// `scr_buf_len > 0`, otherwise the file at `file_path`. When neither is given
// the reader is returned without a source.
// An `end_index` of -1 is replaced by the length of the source.
// A UTF-8 BOM (EF BB BF) found at offset 0 is skipped when `start_index` is 0.
//
// Errors: the file does not exist, `mem_buf_size` is not positive for a file
// source, or opening/seeking/reading the file fails. On a file error the file
// handle is closed before the error is returned.
pub fn csv_sequential_reader(cfg SequentialReaderConfig) !&SequentialReader {
	mut cr := &SequentialReader{}

	cr.start_index = cfg.start_index
	cr.end_index = cfg.end_index

	if cfg.scr_buf != 0 && cfg.scr_buf_len > 0 {
		// reading from a RAM buffer
		cr.mem_buf_type = ram_csv // RAM buffer
		cr.mem_buf = cfg.scr_buf
		cr.mem_buf_size = cfg.scr_buf_len
		if cfg.end_index == -1 {
			cr.end_index = cfg.scr_buf_len
		}

		// check if BOM header is in the memory buffer: only when reading starts
		// at offset 0 (same rule as the file source) and only when the buffer
		// really holds the 3 bytes that are inspected
		if cfg.start_index == 0 && cfg.scr_buf_len >= 3 {
			unsafe {
				if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
					&& *(&u8(cr.mem_buf) + 2) == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
					cr.start_index += 3 // skip the BOM
				}
			}
		}
		cr.mem_buf_start = 0
		cr.mem_buf_end = cr.mem_buf_size
	} else if cfg.file_path.len > 0 {
		// reading from a file
		if !os.exists(cfg.file_path) {
			return error('ERROR: file ${cfg.file_path} not found!')
		}
		if cfg.mem_buf_size <= 0 {
			return error('ERROR: mem_buf_size must be greater than 0!')
		}
		cr.mem_buf_type = file_csv // File buffer

		cr.open_file_source(cfg) or {
			// do not leave the file open when the setup fails half way
			if cr.f.is_opened {
				cr.f.close()
			}
			return err
		}

		// allocate the read buffer only after every fallible step succeeded,
		// so no error path can leak it
		unsafe {
			cr.mem_buf = malloc(cfg.mem_buf_size)
		}
		cr.mem_buf_size = cfg.mem_buf_size
	}

	cr.default_cell = cfg.default_cell
	cr.empty_cell = cfg.empty_cell
	cr.end_line_len = cfg.end_line_len
	cr.separator = cfg.separator
	cr.comment = cfg.comment
	cr.quote = cfg.quote

	return cr
}
141
// dispose_csv_reader releases the resources held by the reader.
// A RAM-backed reader owns nothing: its buffer is the one passed in by the
// caller. A file-backed reader closes its file and frees its read buffer;
// mem_buf_size is reset to 0, so a second call does not free again.
pub fn (mut cr SequentialReader) dispose_csv_reader() {
	if cr.mem_buf_type == ram_csv {
		// do nothing, ram buffer is static
		return
	}
	if cr.mem_buf_type != file_csv {
		return
	}

	// file close
	if cr.f.is_opened {
		cr.f.close()
	}

	// nothing allocated, nothing to free
	if cr.mem_buf_size <= 0 {
		return
	}

	// free the allocated memory
	unsafe {
		free(cr.mem_buf)
	}
	cr.mem_buf = unsafe { nil }
	cr.mem_buf_size = 0
}
162
// has_data returns how many bytes are left between the current read position
// (start_index) and the end of the readable range (end_index).
pub fn (mut cr SequentialReader) has_data() i64 {
	remaining := cr.end_index - cr.start_index
	return remaining
}
167
// fill_buffer reloads mem_buf with the bytes of the file that start at `index`
// and stores the covered range in mem_buf_start / mem_buf_end.
// A RAM-backed reader is left untouched.
// Seek and read errors are propagated to the caller.
fn (mut cr SequentialReader) fill_buffer(index i64) ! {
	if cr.mem_buf_type == ram_csv {
		// for now do nothing if ram buffer
		return
	}

	cr.f.seek(index, .start)!
	// IMPORTANT: add 64 bit support in vlib!!
	loaded := cr.f.read_into_ptr(cr.mem_buf, int(cr.mem_buf_size))!
	cr.mem_buf_start = index
	cr.mem_buf_end = index + loaded
}
179
// SequentialReadingState lists the states of the row parser in get_next_row.
enum SequentialReadingState as u16 {
	// inside a comment line, chars are dropped until the end of the line
	comment
	// inside a quoted cell, after the opening quote
	quote
	// after the closing quote of a quoted cell, before the separator/end of line
	after_quote
	// inside an unquoted cell (initial state of every row)
	cell
	// not assigned by the parser code visible in this file
	newline
}
187
// take_cell returns the bytes collected in ch_buf as a new string, or
// empty_cell when nothing was collected, and empties ch_buf.
fn (mut cr SequentialReader) take_cell() string {
	cell := if cr.ch_buf.len == 0 { cr.empty_cell } else { cr.ch_buf.bytestr() }
	cr.ch_buf.clear()
	return cell
}

// get_next_row get the next row from the CSV file as a string array.
//
// Parsing starts at start_index and stops at the first end_line that closes a
// non-empty row, or at end_index. Empty rows and rows starting with the
// comment char are skipped. Empty cells are returned as empty_cell.
// When the data ends without a final end_line, the cell still pending is
// returned as well instead of being dropped.
//
// On return start_index holds the position where the next call resumes. When
// the row was closed by an end_line this is the end_line index plus
// end_line_len - 1, because the loop is left before its final increment.
//
// Errors: an end_line found inside a quoted cell, or a failure while
// refilling the buffer from the file.
pub fn (mut cr SequentialReader) get_next_row() ![]string {
	mut row_res := []string{}
	// clear the cell buffer
	cr.ch_buf.clear()
	mut i := cr.start_index
	mut state := SequentialReadingState.cell
	// set when an end_line closed the row, i.e. the last cell was already emitted
	mut row_closed := false

	p := &u8(cr.mem_buf)
	for i < cr.end_index {
		if i < cr.mem_buf_start || i >= cr.mem_buf_end {
			cr.fill_buffer(i)!
		}
		ch := unsafe { *(p + i - cr.mem_buf_start) }

		if state == .cell {
			if ch == cr.separator {
				row_res << cr.take_cell()
			} else if cr.ch_buf.len == 0 && ch == cr.comment && row_res.len == 0 {
				// comment char as first char of the row
				state = .comment
			} else if ch == cr.quote {
				state = .quote
				cr.ch_buf.clear()
				cr.col_count++
				i++
				continue
			} else if ch == cr.end_line {
				cr.row_count++
				cr.col_count = 0

				// skip empty rows
				if !(row_res.len == 0 && cr.ch_buf.len < 1) {
					row_res << cr.take_cell()
					i += cr.end_line_len - 1
					row_closed = true
					break
				}
			} else if ch == `\r` && cr.end_line_len == 2 {
				// skip CR
			} else { // normal char inside a cell
				cr.ch_buf << ch
			}
		}

		if state == .comment {
			// drop every char of the comment line, the end_line resumes normal parsing
			if ch == cr.end_line {
				state = .cell
			}
		}

		if state == .quote {
			if ch == cr.quote {
				// closing quote: the quoted cell is complete
				row_res << cr.take_cell()

				state = .after_quote
				cr.col_count++
				i++
				continue
			} else if ch == cr.end_line {
				return error('ERROR: quote not closed at row ${cr.row_count} after column ${cr.col_count}!')
			} else { // normal char inside a quote inside a cell
				cr.ch_buf << ch
			}
		}

		if state == .after_quote {
			// chars between the closing quote and the separator/end_line are ignored
			if ch == cr.separator {
				state = .cell
			} else if ch == cr.end_line {
				cr.row_count++
				cr.col_count = 0
				cr.ch_buf.clear()
				i += cr.end_line_len - 1
				row_closed = true
				break
			}
		}

		cr.col_count++
		i++
	}

	// the data ended inside an unquoted cell, without an end_line: emit the
	// pending cell with the same "not an empty row" rule used at an end_line
	if !row_closed && state == .cell && !(row_res.len == 0 && cr.ch_buf.len < 1) {
		row_res << cr.take_cell()
	}

	cr.start_index = i
	return row_res
}
299