/**
Convert CSV formatted data to TSV format.

This program converts comma-separated value data to tab-separated format.

Copyright (c) 2016-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module tsv_utils.csv2tsv;

import std.stdio;
import std.exception : enforce;
import std.format : format;
import std.range;
import std.traits : isArray, Unqual;
import std.typecons : tuple;

immutable helpText = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts comma-separated text (CSV) to tab-separated format (TSV). Records
are read from files or standard input, converted records written to standard output.
Use '--help-verbose' for details the CSV formats accepted.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts CSV (comma-separated) text to TSV (tab-separated) format. Records
are read from files or standard input, converted records written to standard output.

Both formats represent tabular data, each record on its own line, fields separated
by a delimiter character. The key difference is that CSV uses escape sequences to
represent newlines and field separators in the data, whereas TSV disallows these
characters in the data. The most common field delimiters are comma for CSV and tab
for TSV, but any character can be used.

Conversion to TSV is done by removing CSV escape syntax, changing field delimiters,
and replacing newlines and field delimiters in the data. By default, newlines and
field delimiters in the data are replaced by spaces. Most details are customizable.

There is no single spec for CSV, any number of variants can be found. The escape
syntax is common enough: fields containing newlines or field delimiters are placed
in double quotes. Inside a quoted field, a double quote is represented by a pair of
double quotes. As with field separators, the quoting character is customizable.

Behaviors of this program that often vary between CSV implementations:
* Newlines are supported in quoted fields.
* Double quotes are permitted in a non-quoted field. However, a field starting
with a quote must follow quoting rules.
* Each record can have a different numbers of fields.
* The three common forms of newlines are supported: CR, CRLF, LF. Output is
written using Unix newlines (LF).
* A newline will be added if the file does not end with one.
* A UTF-8 Byte Order Mark (BOM) at the start of a file will be removed.
* No whitespace trimming is done.

This program does not validate CSV correctness, but will terminate with an error
upon reaching an inconsistent state. Improperly terminated quoted fields are the
primary cause.

UTF-8 input is assumed. Convert other encodings prior to invoking this tool.

Options:
EOS";

/** Container for command line options.
 */
struct Csv2tsvOptions
{
    string programName;
    bool helpVerbose = false;           // --help-verbose
    bool hasHeader = false;             // --H|header
    char csvQuoteChar = '"';            // --q|quote
    char csvDelimChar = ',';            // --c|csv-delim
    char tsvDelimChar = '\t';           // --t|tsv-delim
    string tsvDelimReplacement = " ";   // --r|tab-replacement
    string newlineReplacement = " ";    // --n|newline-replacement
    bool versionWanted = false;         // --V|version

    /** Parses command line arguments into this struct and validates them.
     *
     * Returns tuple(false, exitCode) when the program should terminate without
     * further processing (help/version output, or an argument error), and
     * tuple(true, 0) when processing should continue. Validation failures are
     * reported to stderr and yield tuple(false, 1).
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : canFind;
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            auto r = getopt(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header. Only the header of the first file is output.", &hasHeader,
                std.getopt.config.caseSensitive,
                "q|quote", "CHR Quoting character in CSV data. Default: double-quote (\")", &csvQuoteChar,
                "c|csv-delim", "CHR Field delimiter in CSV data. Default: comma (,).", &csvDelimChar,
                "t|tsv-delim", "CHR Field delimiter in TSV data. Default: TAB", &tsvDelimChar,
                "r|tab-replacement", "STR Replacement for TSV field delimiters (typically TABs) found in CSV input. Default: Space.", &tsvDelimReplacement,
                "n|newline-replacement", "STR Replacement for newlines found in CSV input. Default: Space.", &newlineReplacement,
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("csv2tsv"));
                return tuple(false, 0);
            }

            /* Consistency checks. Delimiters, quotes and replacement strings must not
             * collide with each other or with the record separator (newline). */
            enforce(csvQuoteChar != '\n' && csvQuoteChar != '\r',
                    "CSV quote character cannot be newline (--q|quote).");

            enforce(csvQuoteChar != csvDelimChar,
                    "CSV quote and CSV field delimiter characters must be different (--q|quote, --c|csv-delim).");

            enforce(csvQuoteChar != tsvDelimChar,
                    "CSV quote and TSV field delimiter characters must be different (--q|quote, --t|tsv-delim).");

            enforce(csvDelimChar != '\n' && csvDelimChar != '\r',
                    "CSV field delimiter cannot be newline (--c|csv-delim).");

            enforce(tsvDelimChar != '\n' && tsvDelimChar != '\r',
                    "TSV field delimiter cannot be newline (--t|tsv-delim).");

            enforce(!canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(tsvDelimReplacement),
                    "Replacement character cannot contain newlines or TSV field delimiters (--r|tab-replacement).");

            enforce(!canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(newlineReplacement),
                    "Replacement character cannot contain newlines or TSV field delimiters (--n|newline-replacement).");
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // No main in unittest
}
else
{
    /** Program entry point: parse arguments and run the conversion. */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        Csv2tsvOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try csv2tsvFiles(cmdopt, cmdArgs[1..$]);
        catch (Exception exc)
        {
            writeln();
            /* NOTE(review): flushing stdin is unusual; fflush on an input stream is
             * implementation-defined. stdout.flush may have been intended here to
             * push buffered output before the error message — confirm. */
            stdin.flush();
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}

/** Converts each input file (or stdin) to TSV and writes the result to standard
 * output. When --header is set, only the first file's header line is output;
 * subsequent files have one leading line skipped.
 */
void csv2tsvFiles(const ref Csv2tsvOptions cmdopt, const string[] inputFiles)
{
    import tsv_utils.common.utils : BufferedOutputRange;

    // 128 KB read buffer, shared across all input files.
    ubyte[1024 * 128] fileRawBuf;
    auto stdoutWriter = BufferedOutputRange!(typeof(stdout))(stdout);
    bool firstFile = true;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File;
        auto printFileName = (filename == "-") ? "stdin" : filename;

        auto skipLines = (firstFile || !cmdopt.hasHeader) ? 0 : 1;

        csv2tsv(inputStream, stdoutWriter, fileRawBuf, printFileName, skipLines,
                cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement,
                cmdopt.newlineReplacement);

        firstFile = false;
    }
}

/* csv2tsv buffered conversion approach

This version of csv2tsv uses a buffered approach to csv-to-tsv conversion. This is a
change from the original version, which used a character-at-a-time approach, with
characters coming from an infinite stream of characters. The character-at-a-time
approach was nice from a simplicity perspective, but the approach didn't optimize well.
Note that the original version read input in blocks and wrote to stdout in blocks, it
was the conversion algorithm itself that was character oriented.
230 231 The idea is to convert a buffer at a time, writing larger blocks to the output stream 232 rather than one character at a time. In addition, the read buffer is modified in-place 233 when the only change is to convert a single character. The notable case is converting 234 the field delimiter character, typically comma to TAB. The result is writing longer 235 blocks to the output stream (BufferedOutputRange). 236 237 Performance improvements from the new algorithm are notable. This is especially true 238 versus the previous version 2.0.0. Note though that the more recent versions of 239 csv2tsv were slower due to degradations coming from compiler and/or language version. 240 Version 1.1.19 was quite a bit faster. Regardless of version, the performance 241 improvement is especially good when run against "simple" CSV files, with limited 242 amounts of CSV escape syntax. In these files the main change is converting the field 243 delimiter character, typically comma to TAB. 244 245 In some benchmarks on Mac OS, the new version was 40% faster than csv2tsv 2.0.0 on 246 files with significant CSV escapes, and 60% faster on files with limited CSV escapes. 247 Versus csv2tsv version 1.1.19, the new version is 10% and 40% faster on the same 248 files. On the "simple CSV" file, where Unix 'tr' is an option, 'tr' was still faster, 249 by about 20%. But getting into the 'tr' ballpark while retaining safety of correct 250 csv2tsv conversion is a good result. 251 252 Algorithm notes: 253 254 The algorithm works by reading an input block, then examining each byte in-order to 255 identify needed modifications. The region of consecutive characters without a change 256 is tracked. Single character changes are done in-place, in the read buffer. This 257 allows assembling longer blocks before write is needed. The region being tracked is 258 written to the output stream when it can no longer be extended in a continuous 259 fashion. At this point a new region is started. 
When the current read buffer has 260 been processed the current region is written out and a new block of data read in. 261 262 The read buffer uses fixed size blocks. This means the algorithm is actually 263 operating on bytes (UTF-8 code units), and not characters. This works because all 264 delimiters and CSV escape syntax characters are single byte UTF-8 characters. These 265 are the only characters requiring interpretation. The main nuisance is the 2-byte 266 CRLF newline sequence, as this might be split across two read buffers. This is 267 handled by embedding 'CR' states in the finite state machine. 268 269 Processing CSV escapes will often cause character removals and additions. These 270 will not be representable in a continuous stream of bytes without moving bytes around. 271 Instead of moving bytes, these cases are handled by immediately writing to the output 272 stream. This allows restarting a new block of contiguous characters. Handling by the 273 new algorithm is described below. Note that the length of the replacement characters 274 for TSV field and record delimiters (e.g. TAB, newline) affects the processing. 275 276 All replacement character lengths: 277 278 * Windows newline (CRLF) at the end of a line - Replace the CRLF with LF. 279 280 Replace the CR with LF, add it to the current write region and terminate it. The 281 next write region starts at the character after the LF. 282 283 * Double quote starting or ending a field - Drop the double quote. 284 285 Terminate the current write region, next write region starts at the next character. 286 287 * Double quote pair inside a quoted field - Drop one of the double quotes. 288 289 The algorithm drops the first double quote and keeps the second. This avoids 290 look-ahead and both field terminating double quote and double quote pair can be 291 handled the same way. Terminate the current write region without adding the double 292 quote. The next write region starts at the next character.
293 294 Single byte replacement characters: 295 296 * Windows newline (CRLF) in a quoted field 297 298 Replace the CR with the replacement char, add it to the current write region and 299 terminate it. The next write region starts at the character after the LF. 300 301 Multi-byte replacement sequences: 302 303 * TSV Delimiter (TAB by default) in a field 304 305 Terminate the current write region, write it out and the replacement. The next 306 write region starts at the next character. 307 308 * LF, CR, or CRLF in a quoted field 309 310 Terminate the current write region, write it and the replacement. The next write 311 region starts at the next character. 312 313 csv2tsv API 314 315 At the API level, it is desirable to handle at both open files and input streams. 316 Open files are the key requirement, but handling input streams simplifies unit 317 testing, and in-memory conversion is likely to be useful anyway. Internally, it 318 should be easy enough to encapsulate the differences between input streams and files. 319 Reading files can be done using File.byChunk and reading from input streams can be 320 done using std.range.chunks. 321 322 This has been handled by creating a new range that can iterate either files or 323 input streams chunk-by-chunk. 324 */ 325 326 /** Defines the 'bufferable' input sources supported by inputSourceByChunk. 327 * 328 * This includes std.stdio.File objects and mutable dynamic ubyte arrays (inputRange 329 * with slicing). 330 * 331 * Note: The mutable, dynamic arrays restriction is based on what is supported by 332 * std.range.chunks. This could be extended to include any type of array with ubyte 333 * elements, but it would require custom code in inputSourceByChunk. A test could be 334 * added as '(isArray!(R) && is(Unqual!(typeof(R.init[0])) == ubyte))'. 
 */
enum bool isBufferableInputSource(R) =
    isFileHandle!(Unqual!R) ||
    (isInputRange!R && is(ElementEncodingType!R == ubyte) && hasSlicing!R);

@safe unittest
{
    static assert(isBufferableInputSource!(File));
    static assert(isBufferableInputSource!(typeof(stdin)));
    static assert(isBufferableInputSource!(ubyte[]));
    static assert(!isBufferableInputSource!(char[]));
    static assert(!isBufferableInputSource!(string));

    ubyte[10] x1;
    const ubyte[1] x2;
    immutable ubyte[1] x3;
    ubyte[] x4 = new ubyte[](10);
    const ubyte[] x5 = new ubyte[](10);
    immutable ubyte[] x6 = new ubyte[](10);

    /* Static arrays and non-mutable dynamic arrays do not qualify; only a
     * mutable, dynamic ubyte[] does. */
    static assert(!isBufferableInputSource!(typeof(x1)));
    static assert(!isBufferableInputSource!(typeof(x2)));
    static assert(!isBufferableInputSource!(typeof(x3)));
    static assert(isBufferableInputSource!(typeof(x4)));
    static assert(!isBufferableInputSource!(typeof(x5)));
    static assert(!isBufferableInputSource!(typeof(x6)));

    static assert(is(Unqual!(ElementType!(typeof(x1))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x2))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x3))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x4))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x5))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(x6))) == ubyte));

    /* S1: an input range of ubyte, but without slicing - should not qualify. */
    struct S1
    {
        void popFront();
        @property bool empty();
        @property ubyte front();
    }

    /* S2: an input range of ubyte with save, length and slicing - qualifies. */
    struct S2
    {
        @property ubyte front();
        void popFront();
        @property bool empty();
        @property auto save() { return this; }
        @property size_t length();
        S2 opSlice(size_t, size_t);
    }

    static assert(isInputRange!S1);
    static assert(!isBufferableInputSource!S1);

    static assert(isInputRange!S2);
    static assert(is(ElementEncodingType!S2 == ubyte));
    static assert(hasSlicing!S2);
    static assert(isBufferableInputSource!S2);

    /* For code coverage. */
    S2 s2;
    auto x = s2.save;
}

/** inputSourceByChunk returns a range that reads either a file handle (File) or a
 * ubyte[] array a chunk at a time.
 *
 * This is a cover for File.byChunk that allows passing an in-memory array as well.
 * At present the motivation is primarily to enable unit testing of chunk-based
 * algorithms using in-memory strings. At present the in-memory input types are
 * limited. In the future this may be changed to accept any type of character or
 * ubyte array.
 *
 * inputSourceByChunk takes either a File open for reading or a ubyte[] array
 * containing input data. Data is read a buffer at a time. The buffer can be
 * user provided, or allocated by inputSourceByChunk based on a caller provided
 * buffer size.
 *
 * A ubyte[] input source must satisfy isBufferableInputSource, which at present
 * means that it is a dynamic, mutable ubyte[].
 *
 * The chunks are returned as an input range.
 */
auto inputSourceByChunk(InputSource)(InputSource source, size_t size)
{
    // Convenience overload: allocate the read buffer on behalf of the caller.
    return inputSourceByChunk(source, new ubyte[](size));
}

/// Ditto
auto inputSourceByChunk(InputSource)(InputSource source, ubyte[] buffer)
if (isBufferableInputSource!InputSource)
{
    static if (isFileHandle!(Unqual!InputSource))
    {
        return source.byChunk(buffer);
    }
    else
    {
        /* In-memory case: adapt std.range.chunks to copy each chunk into the
         * caller-provided buffer, mirroring File.byChunk semantics. */
        static struct BufferedChunk
        {
            private Chunks!InputSource _chunks;
            private ubyte[] _buffer;

            private void readNextChunk()
            {
                if (_chunks.empty)
                {
                    // Zero-length buffer marks the range as exhausted (see empty()).
                    _buffer.length = 0;
                }
                else
                {
                    size_t len = _chunks.front.length;
                    _buffer[0 .. len] = _chunks.front[];
                    _chunks.popFront;

                    /* Only the last chunk should be shorter than the buffer. */
                    assert(_buffer.length == len || _chunks.empty);

                    if (_buffer.length != len) _buffer.length = len;
                }
            }

            this(InputSource source, ubyte[] buffer)
            {
                enforce(buffer.length > 0, "buffer size must be larger than 0");
                _chunks = source.chunks(buffer.length);
                _buffer = buffer;
                readNextChunk();
            }

            @property bool empty()
            {
                return (_buffer.length == 0);
            }

            @property ubyte[] front()
            {
                assert(!empty, "Attempting to fetch the front of an empty inputSourceByChunks");
                return _buffer;
            }

            void popFront()
            {
                assert(!empty, "Attempting to popFront an empty inputSourceByChunks");
                readNextChunk();
            }
        }

        return BufferedChunk(source, buffer);
    }
}

unittest // inputSourceByChunk
{
    import tsv_utils.common.unittest_utils;   // tsv-utils unit test helpers
    import std.file : mkdir, rmdirRecurse;
    import std.path : buildPath;

    auto testDir = makeUnittestTempDir("csv2tsv_inputSourceByChunk");
    scope(exit) testDir.rmdirRecurse;

    import std.algorithm : equal, joiner;
    import std.format;
    import std.string : representation;

    // Multi-byte UTF-8 data ensures chunk boundaries can split code points.
    auto charData = "abcde,ßÀß,あめりか物語,012345";
    ubyte[] ubyteData = charData.dup.representation;

    ubyte[1024] rawBuffer;  // Must be larger than largest bufferSize in tests.

    void writeFileData(string filePath, ubyte[] data)
    {
        import std.stdio;

        auto f = filePath.File("w");
        f.rawWrite(data);
        f.close;
    }

    /* Exercise every (dataSize, bufferSize) combination, including buffers both
     * smaller and larger than the data, for all four source/buffer pairings. */
    foreach (size_t dataSize; 0 .. ubyteData.length)
    {
        auto data = ubyteData[0 .. dataSize];
        auto filePath = buildPath(testDir, format("data_%d.txt", dataSize));
        writeFileData(filePath, data);

        foreach (size_t bufferSize; 1 .. dataSize + 2)
        {
            assert(data.inputSourceByChunk(bufferSize).joiner.equal(data),
                   format("[Test-A] dataSize: %d, bufferSize: %d", dataSize, bufferSize));

            assert (rawBuffer.length >= bufferSize);

            ubyte[] buffer = rawBuffer[0 .. bufferSize];
            assert(data.inputSourceByChunk(buffer).joiner.equal(data),
                   format("[Test-B] dataSize: %d, bufferSize: %d", dataSize, bufferSize));

            {
                auto inputStream = filePath.File;
                assert(inputStream.inputSourceByChunk(bufferSize).joiner.equal(data),
                       format("[Test-C] dataSize: %d, bufferSize: %d", dataSize, bufferSize));
                inputStream.close;
            }

            {
                auto inputStream = filePath.File;
                assert(inputStream.inputSourceByChunk(buffer).joiner.equal(data),
                       format("[Test-D] dataSize: %d, bufferSize: %d", dataSize, bufferSize));
                inputStream.close;
            }
        }
    }
}

/** Read CSV from an input source, convert to TSV and write to an output source.
 *
 * Params:
 *   inputSource = A "bufferable" input source, either a file open for
 *       read, or a dynamic, mutable ubyte array.
 *   outputStream = An output range to write TSV bytes to.
 *   readBuffer = A buffer to use for reading.
 *   filename = Name of file to use when reporting errors. A descriptive
 *       name can be used in lieu of a file name.
 *   skipLines = Number of lines to skip before outputting records.
 *       Typically used to skip writing header lines.
 *   csvQuote = The quoting character used in the CSV input.
 *   csvDelim = The field delimiter character used in the CSV input.
 *   tsvDelim = The field delimiter character to use in the TSV output.
 *   tsvDelimReplacement = String to use when replacing TSV field delimiters
 *       (e.g. TABs) found in the CSV data fields.
 *   tsvNewlineReplacement = String to use when replacing newlines found in the CSV
 *       data fields.
 *   discardBOM = If true (the default), a UTF-8 Byte Order Mark found at the
 *       start of the input stream will be dropped.
 *
 * Throws: Exception on finding inconsistent CSV. Exception text includes the filename and
 *     line number where the error was identified.
 */
void csv2tsv(InputSource, OutputRange)(
    InputSource inputSource,
    auto ref OutputRange outputStream,
    ubyte[] readBuffer,
    string filename = "(none)",
    size_t skipLines = 0,
    const char csvQuote = '"',
    const char csvDelim = ',',
    const char tsvDelim = '\t',
    const string tsvDelimReplacement = " ",
    const string tsvNewlineReplacement = " ",
    bool discardBOM = true,
)
if (isBufferableInputSource!InputSource &&
    isOutputRange!(OutputRange, char))
{
    import std.conv: hexString;

    assert (readBuffer.length >= 1);

    enum char LF = '\n';
    enum char CR = '\r';

    enum ubyte[3] UTF8_BOM = cast(ubyte[3])hexString!"efbbbf";

    /* Process state information - These variables are defined either in the outer
     * context or within one of the foreach loops.
     *
     * * recordNum - The current CSV input line/record number. Starts at one.
     * * fieldNum - Field number in the current line/record. Field numbers are
     *   one-upped. The field number set to zero at the start of a new record,
     *   prior to processing the first character of the first field on the record.
     * * byteIndex - Read buffer index of the current byte being processed.
     * * csvState - The current state of CSV processing. In particular, the state
     *   of the finite state machine.
     * * writeRegionStart - Read buffer index where the next write starts from.
     * * nextIndex - The index of the current input ubyte being processed. The
     *   current write region extends from the writeRegionStart to nextIndex.
     * * nextChar - The current input ubyte. The ubyte/char at nextIndex.
     */

    enum CSVState
    {
        FieldEnd,            // Start of input or after consuming a field or record delimiter.
        NonQuotedField,      // Processing a non-quoted field
        QuotedField,         // Processing a quoted field
        QuoteInQuotedField,  // Last char was a quote in a quoted field
        CRAtFieldEnd,        // Last char was a CR terminating a record/line
        CRInQuotedField,     // Last char was a CR in a quoted field
    }

    CSVState csvState = CSVState.FieldEnd;
    size_t recordNum = 1;
    size_t fieldNum = 0;

    foreach (chunkIndex, inputChunkComplete; inputSource.inputSourceByChunk(readBuffer).enumerate)
    {
        size_t writeRegionStart = 0;

        /* Discard byte order marks at the start of input.
         * Note: Slicing the chunk in this fashion generates very good code, better
         * than other approaches like manipulating indices.
         */
        auto inputChunk =
            (discardBOM &&
             chunkIndex == 0 &&
             inputChunkComplete.length >= UTF8_BOM.length &&
             inputChunkComplete[0 .. UTF8_BOM.length] == UTF8_BOM
            )
            ? inputChunkComplete[UTF8_BOM.length .. $]
            : inputChunkComplete[];

        /* flushCurrentRegion flushes the current write region and moves the start of
         * the next write region one byte past the end of the current region. If
         * appendChars are provided they are output as well.
         *
         * This routine is called when the current character (byte) terminates the
         * current write region and should not itself be output. That is why the next
         * write region always starts one byte past the current region end.
         *
         * This routine is also called when the 'skipLines' region has been processed.
         * This is done to flush the region without actually writing it. This is done
         * by explicit checks in the finite state machine when newline characters
         * that terminate a record are processed. It would be nice to refactor this.
         */
        void flushCurrentRegion(size_t regionEnd, const char[] appendChars = "")
        {
            assert(regionEnd <= inputChunk.length);

            if (recordNum > skipLines)
            {
                if (regionEnd > writeRegionStart)
                {
                    outputStream.put(inputChunk[writeRegionStart .. regionEnd]);
                }
                if (appendChars.length > 0)
                {
                    outputStream.put(appendChars);
                }
            }

            writeRegionStart = regionEnd + 1;
        }

        foreach (size_t nextIndex, char nextChar; inputChunk)
        {
        OuterSwitch: final switch (csvState)
            {
            case CSVState.FieldEnd:
                /* Start of input or after consuming a field terminator. */
                ++fieldNum;

                /* Note: Can't use switch due to the 'goto case' to the OuterSwitch. */
                if (nextChar == csvQuote)
                {
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.QuotedField;
                    break OuterSwitch;
                }
                else
                {
                    /* Processing state change only. Don't consume the character. */
                    csvState = CSVState.NonQuotedField;
                    goto case CSVState.NonQuotedField;
                }

            case CSVState.NonQuotedField:
                switch (nextChar)
                {
                default:
                    break OuterSwitch;
                case csvDelim:
                    // In-place single byte replacement: comma becomes TAB.
                    inputChunk[nextIndex] = tsvDelim;
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case LF:
                    if (recordNum == skipLines) flushCurrentRegion(nextIndex);
                    ++recordNum;
                    fieldNum = 0;
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case CR:
                    inputChunk[nextIndex] = LF;
                    if (recordNum == skipLines) flushCurrentRegion(nextIndex);
                    ++recordNum;
                    fieldNum = 0;
                    csvState = CSVState.CRAtFieldEnd;
                    break OuterSwitch;
                case tsvDelim:
                    if (tsvDelimReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvDelimReplacement[0];
                    }
                    else
                    {
                        // Multi-byte replacement: must flush and append separately.
                        flushCurrentRegion(nextIndex, tsvDelimReplacement);
                    }
                    break OuterSwitch;
                }

            case CSVState.QuotedField:
                switch (nextChar)
                {
                default:
                    break OuterSwitch;
                case csvQuote:
                    /*
                     * Flush the current region, without the double quote. Switch state
                     * to QuoteInQuotedField, which determines whether to output a quote.
                     */
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.QuoteInQuotedField;
                    break OuterSwitch;

                case tsvDelim:
                    if (tsvDelimReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvDelimReplacement[0];
                    }
                    else
                    {
                        flushCurrentRegion(nextIndex, tsvDelimReplacement);
                    }
                    break OuterSwitch;
                case LF:
                    /* Newline in a quoted field. */
                    if (tsvNewlineReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvNewlineReplacement[0];
                    }
                    else
                    {
                        flushCurrentRegion(nextIndex, tsvNewlineReplacement);
                    }
                    break OuterSwitch;
                case CR:
                    /* Carriage Return in a quoted field. */
                    if (tsvNewlineReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvNewlineReplacement[0];
                    }
                    else
                    {
                        flushCurrentRegion(nextIndex, tsvNewlineReplacement);
                    }
                    csvState = CSVState.CRInQuotedField;
                    break OuterSwitch;
                }

            case CSVState.QuoteInQuotedField:
                /* Just processed a quote in a quoted field. The buffer, without the
                 * quote, was just flushed. Only legal characters here are quote,
                 * comma (field delimiter), newline (record delimiter).
                 */
                switch (nextChar)
                {
                case csvQuote:
                    // Escaped quote pair; the second quote is kept in the data.
                    csvState = CSVState.QuotedField;
                    break OuterSwitch;
                case csvDelim:
                    inputChunk[nextIndex] = tsvDelim;
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case LF:
                    if (recordNum == skipLines) flushCurrentRegion(nextIndex);
                    ++recordNum;
                    fieldNum = 0;
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case CR:
                    inputChunk[nextIndex] = LF;
                    if (recordNum == skipLines) flushCurrentRegion(nextIndex);
                    ++recordNum;
                    fieldNum = 0;
                    csvState = CSVState.CRAtFieldEnd;
                    break OuterSwitch;
                default:
                    throw new Exception(
                        format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                               (filename == "-") ? "Standard Input" : filename,
                               recordNum));
                }

            case CSVState.CRInQuotedField:
                if (nextChar == LF)
                {
                    // CRLF inside a quoted field; the CR was already replaced, drop the LF.
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.QuotedField;
                    break OuterSwitch;
                }
                else {
                    /* Naked CR. State change only, don't consume current character. */
                    csvState = CSVState.QuotedField;
                    goto case CSVState.QuotedField;
                }

            case CSVState.CRAtFieldEnd:
                if (nextChar == LF)
                {
                    // CRLF line ending; the CR was already rewritten to LF, drop the LF.
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                }
                else {
                    /* Naked CR. State change only, don't consume current character. */
                    csvState = CSVState.FieldEnd;
                    goto case CSVState.FieldEnd;
                }
            }
        }

        /* End of buffer. */
        if (writeRegionStart < inputChunk.length && recordNum > skipLines)
        {
            outputStream.put(inputChunk[writeRegionStart .. $]);
        }

        writeRegionStart = 0;
    }

    enforce(csvState != CSVState.QuotedField,
            format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                   (filename == "-") ? "Standard Input" : filename,
                   recordNum));

    /* Output a newline if the CSV input did not have a terminating newline. */
    if (fieldNum > 0 && recordNum > skipLines) put(outputStream, '\n');
}

unittest
{
    /* Unit tests for the csv2tsv function.
     *
     * These unit tests exercise different CSV combinations and escaping cases. The CSV
     * data content is the same for each corresponding test string, except the delimiters
     * have been changed. e.g csv6a and csv6b have the same data content.
     *
     * A property used in these tests is that changing the CSV delimiters doesn't change
     * the resulting TSV. However, changing the TSV delimiters will change the TSV result,
     * as TSV doesn't support having its delimiters in the data.
This allows having a 873 * single TSV expected set that is generated by CSVs with different delimter sets. 874 * 875 * This test set does not test main, file handling, or error messages. These are 876 * handled by tests run against the executable. 877 * 878 * Note: unittest is non @safe due to the casts from string to ubyte[]. This can 879 * probably be rewritten to use std.string.representation instead, which is @safe. 880 */ 881 882 /* Default CSV. */ 883 auto csv1a = "a,b,c"; 884 auto csv2a = "a,bc,,,def"; 885 auto csv3a = ",a, b , cd ,"; 886 auto csv4a = "ß,ßÀß,あめりか物語,书名: 五色石"; 887 auto csv5a = "\"\n\",\"\n\n\",\"\n\n\n\""; 888 auto csv6a = "\"\t\",\"\t\t\",\"\t\t\t\""; 889 auto csv7a = "\",\",\",,\",\",,,\""; 890 auto csv8a = "\"\",\"\"\"\",\"\"\"\"\"\""; 891 auto csv9a = "\"ab, de\tfg\"\"\nhij\""; 892 auto csv10a = ""; 893 auto csv11a = ","; 894 auto csv12a = ",,"; 895 auto csv13a = "\"\r\",\"\r\r\",\"\r\r\r\""; 896 auto csv14a = "\"\r\n\",\"\r\n\r\n\",\"\r\n\r\n\r\n\""; 897 auto csv15a = "\"ab, de\tfg\"\"\rhij\""; 898 auto csv16a = "\"ab, de\tfg\"\"\r\nhij\""; 899 auto csv17a = "ab\",ab\"cd"; 900 auto csv18a = "\n\n\n"; 901 auto csv19a = "\t"; 902 auto csv20a = "\t\t"; 903 auto csv21a = "a\n"; 904 auto csv22a = "a,\n"; 905 auto csv23a = "a,b\n"; 906 auto csv24a = ",\n"; 907 auto csv25a = "#"; 908 auto csv26a = "^"; 909 auto csv27a = "#^#"; 910 auto csv28a = "^#^"; 911 auto csv29a = "$"; 912 auto csv30a = "$,$\n\"$\",\"$$\",$$\n^#$,$#^,#$^,^$#\n"; 913 auto csv31a = "1-1\n2-1,2-2\n3-1,3-2,3-3\n\n,5-2\n,,6-3\n"; 914 auto csv32a = ",1-2,\"1-3\"\n\"2-1\",\"2-2\",\n\"3-1\",,\"3-3\""; 915 916 // Newlines terminating a line ending a non-quoted field 917 auto csv33a = "\rX\r\nX\n\r\nX\r\n"; 918 919 // Newlines inside a quoted field and terminating a line following a quoted field 920 auto csv34a = "\"\r\",\"X\r\",\"X\rY\",\"\rY\"\r\"\r\n\",\"X\r\n\",\"X\r\nY\",\"\r\nY\"\r\n\"\n\",\"X\n\",\"X\nY\",\"\nY\"\n"; 921 922 // CR at field end 923 auto csv35a = 
"abc,def\r\"ghi\",\"jkl\"\r\"mno\",pqr\r"; 924 925 /* Set B has the same data and TSV results as set A, but uses # for quote and ^ for comma. */ 926 auto csv1b = "a^b^c"; 927 auto csv2b = "a^bc^^^def"; 928 auto csv3b = "^a^ b ^ cd ^"; 929 auto csv4b = "ß^ßÀß^あめりか物語^书名: 五色石"; 930 auto csv5b = "#\n#^#\n\n#^#\n\n\n#"; 931 auto csv6b = "#\t#^#\t\t#^#\t\t\t#"; 932 auto csv7b = "#,#^#,,#^#,,,#"; 933 auto csv8b = "##^#\"#^#\"\"#"; 934 auto csv9b = "#ab, de\tfg\"\nhij#"; 935 auto csv10b = ""; 936 auto csv11b = "^"; 937 auto csv12b = "^^"; 938 auto csv13b = "#\r#^#\r\r#^#\r\r\r#"; 939 auto csv14b = "#\r\n#^#\r\n\r\n#^#\r\n\r\n\r\n#"; 940 auto csv15b = "#ab, de\tfg\"\rhij#"; 941 auto csv16b = "#ab, de\tfg\"\r\nhij#"; 942 auto csv17b = "ab\"^ab\"cd"; 943 auto csv18b = "\n\n\n"; 944 auto csv19b = "\t"; 945 auto csv20b = "\t\t"; 946 auto csv21b = "a\n"; 947 auto csv22b = "a^\n"; 948 auto csv23b = "a^b\n"; 949 auto csv24b = "^\n"; 950 auto csv25b = "####"; 951 auto csv26b = "#^#"; 952 auto csv27b = "###^###"; 953 auto csv28b = "#^##^#"; 954 auto csv29b = "$"; 955 auto csv30b = "$^$\n#$#^#$$#^$$\n#^##$#^#$##^#^###$^#^#^$###\n"; 956 auto csv31b = "1-1\n2-1^2-2\n3-1^3-2^3-3\n\n^5-2\n^^6-3\n"; 957 auto csv32b = "^1-2^#1-3#\n#2-1#^#2-2#^\n#3-1#^^#3-3#"; 958 auto csv33b = "\rX\r\nX\n\r\nX\r\n"; 959 auto csv34b = "#\r#^#X\r#^#X\rY#^#\rY#\r#\r\n#^#X\r\n#^#X\r\nY#^#\r\nY#\r\n#\n#^#X\n#^#X\nY#^#\nY#\n"; 960 auto csv35b = "abc^def\r#ghi#^#jkl#\r#mno#^pqr\r"; 961 962 /* The expected results for csv sets A and B. 
This is for the default TSV delimiters.*/ 963 auto tsv1 = "a\tb\tc\n"; 964 auto tsv2 = "a\tbc\t\t\tdef\n"; 965 auto tsv3 = "\ta\t b \t cd \t\n"; 966 auto tsv4 = "ß\tßÀß\tあめりか物語\t书名: 五色石\n"; 967 auto tsv5 = " \t \t \n"; 968 auto tsv6 = " \t \t \n"; 969 auto tsv7 = ",\t,,\t,,,\n"; 970 auto tsv8 = "\t\"\t\"\"\n"; 971 auto tsv9 = "ab, de fg\" hij\n"; 972 auto tsv10 = ""; 973 auto tsv11 = "\t\n"; 974 auto tsv12 = "\t\t\n"; 975 auto tsv13 = " \t \t \n"; 976 auto tsv14 = " \t \t \n"; 977 auto tsv15 = "ab, de fg\" hij\n"; 978 auto tsv16 = "ab, de fg\" hij\n"; 979 auto tsv17 = "ab\"\tab\"cd\n"; 980 auto tsv18 = "\n\n\n"; 981 auto tsv19 = " \n"; 982 auto tsv20 = " \n"; 983 auto tsv21 = "a\n"; 984 auto tsv22 = "a\t\n"; 985 auto tsv23 = "a\tb\n"; 986 auto tsv24 = "\t\n"; 987 auto tsv25 = "#\n"; 988 auto tsv26 = "^\n"; 989 auto tsv27 = "#^#\n"; 990 auto tsv28 = "^#^\n"; 991 auto tsv29 = "$\n"; 992 auto tsv30 = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n"; 993 auto tsv31 = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n"; 994 auto tsv32 = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n"; 995 auto tsv33 = "\nX\nX\n\nX\n"; 996 auto tsv34 = " \tX \tX Y\t Y\n \tX \tX Y\t Y\n \tX \tX Y\t Y\n"; 997 auto tsv35 = "abc\tdef\nghi\tjkl\nmno\tpqr\n"; 998 999 /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab. 1000 * This will also result in different replacements when TAB and $ appear in the CSV. 
1001 */ 1002 auto tsv1_x = "a$b$c\n"; 1003 auto tsv2_x = "a$bc$$$def\n"; 1004 auto tsv3_x = "$a$ b $ cd $\n"; 1005 auto tsv4_x = "ß$ßÀß$あめりか物語$书名: 五色石\n"; 1006 auto tsv5_x = " $ $ \n"; 1007 auto tsv6_x = "\t$\t\t$\t\t\t\n"; 1008 auto tsv7_x = ",$,,$,,,\n"; 1009 auto tsv8_x = "$\"$\"\"\n"; 1010 auto tsv9_x = "ab, de\tfg\" hij\n"; 1011 auto tsv10_x = ""; 1012 auto tsv11_x = "$\n"; 1013 auto tsv12_x = "$$\n"; 1014 auto tsv13_x = " $ $ \n"; 1015 auto tsv14_x = " $ $ \n"; 1016 auto tsv15_x = "ab, de\tfg\" hij\n"; 1017 auto tsv16_x = "ab, de\tfg\" hij\n"; 1018 auto tsv17_x = "ab\"$ab\"cd\n"; 1019 auto tsv18_x = "\n\n\n"; 1020 auto tsv19_x = "\t\n"; 1021 auto tsv20_x = "\t\t\n"; 1022 auto tsv21_x = "a\n"; 1023 auto tsv22_x = "a$\n"; 1024 auto tsv23_x = "a$b\n"; 1025 auto tsv24_x = "$\n"; 1026 auto tsv25_x = "#\n"; 1027 auto tsv26_x = "^\n"; 1028 auto tsv27_x = "#^#\n"; 1029 auto tsv28_x = "^#^\n"; 1030 auto tsv29_x = " \n"; 1031 auto tsv30_x = " $ \n $ $ \n^# $ #^$# ^$^ #\n"; 1032 auto tsv31_x = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n"; 1033 auto tsv32_x = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n"; 1034 auto tsv33_x = "\nX\nX\n\nX\n"; 1035 auto tsv34_x = " $X $X Y$ Y\n $X $X Y$ Y\n $X $X Y$ Y\n"; 1036 auto tsv35_x = "abc$def\nghi$jkl\nmno$pqr\n"; 1037 1038 /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab, 1039 * and with the delimiter/newline replacement string being |--|. Basically, newlines 1040 * and '$' in the original data are replaced by |--|. 
1041 */ 1042 auto tsv1_y = "a$b$c\n"; 1043 auto tsv2_y = "a$bc$$$def\n"; 1044 auto tsv3_y = "$a$ b $ cd $\n"; 1045 auto tsv4_y = "ß$ßÀß$あめりか物語$书名: 五色石\n"; 1046 auto tsv5_y = "|--|$|--||--|$|--||--||--|\n"; 1047 auto tsv6_y = "\t$\t\t$\t\t\t\n"; 1048 auto tsv7_y = ",$,,$,,,\n"; 1049 auto tsv8_y = "$\"$\"\"\n"; 1050 auto tsv9_y = "ab, de\tfg\"|--|hij\n"; 1051 auto tsv10_y = ""; 1052 auto tsv11_y = "$\n"; 1053 auto tsv12_y = "$$\n"; 1054 auto tsv13_y = "|--|$|--||--|$|--||--||--|\n"; 1055 auto tsv14_y = "|--|$|--||--|$|--||--||--|\n"; 1056 auto tsv15_y = "ab, de\tfg\"|--|hij\n"; 1057 auto tsv16_y = "ab, de\tfg\"|--|hij\n"; 1058 auto tsv17_y = "ab\"$ab\"cd\n"; 1059 auto tsv18_y = "\n\n\n"; 1060 auto tsv19_y = "\t\n"; 1061 auto tsv20_y = "\t\t\n"; 1062 auto tsv21_y = "a\n"; 1063 auto tsv22_y = "a$\n"; 1064 auto tsv23_y = "a$b\n"; 1065 auto tsv24_y = "$\n"; 1066 auto tsv25_y = "#\n"; 1067 auto tsv26_y = "^\n"; 1068 auto tsv27_y = "#^#\n"; 1069 auto tsv28_y = "^#^\n"; 1070 auto tsv29_y = "|--|\n"; 1071 auto tsv30_y = "|--|$|--|\n|--|$|--||--|$|--||--|\n^#|--|$|--|#^$#|--|^$^|--|#\n"; 1072 auto tsv31_y = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n"; 1073 auto tsv32_y = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n"; 1074 auto tsv33_y = "\nX\nX\n\nX\n"; 1075 auto tsv34_y = "|--|$X|--|$X|--|Y$|--|Y\n|--|$X|--|$X|--|Y$|--|Y\n|--|$X|--|$X|--|Y$|--|Y\n"; 1076 auto tsv35_y = "abc$def\nghi$jkl\nmno$pqr\n"; 1077 1078 /* The TSV results for CSV sets 1a and 1b, but with the TAB replacement as |TAB| 1079 * and newline replacement |NL|. 
1080 */ 1081 auto tsv1_z = "a\tb\tc\n"; 1082 auto tsv2_z = "a\tbc\t\t\tdef\n"; 1083 auto tsv3_z = "\ta\t b \t cd \t\n"; 1084 auto tsv4_z = "ß\tßÀß\tあめりか物語\t书名: 五色石\n"; 1085 auto tsv5_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n"; 1086 auto tsv6_z = "<TAB>\t<TAB><TAB>\t<TAB><TAB><TAB>\n"; 1087 auto tsv7_z = ",\t,,\t,,,\n"; 1088 auto tsv8_z = "\t\"\t\"\"\n"; 1089 auto tsv9_z = "ab, de<TAB>fg\"<NL>hij\n"; 1090 auto tsv10_z = ""; 1091 auto tsv11_z = "\t\n"; 1092 auto tsv12_z = "\t\t\n"; 1093 auto tsv13_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n"; 1094 auto tsv14_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n"; 1095 auto tsv15_z = "ab, de<TAB>fg\"<NL>hij\n"; 1096 auto tsv16_z = "ab, de<TAB>fg\"<NL>hij\n"; 1097 auto tsv17_z = "ab\"\tab\"cd\n"; 1098 auto tsv18_z = "\n\n\n"; 1099 auto tsv19_z = "<TAB>\n"; 1100 auto tsv20_z = "<TAB><TAB>\n"; 1101 auto tsv21_z = "a\n"; 1102 auto tsv22_z = "a\t\n"; 1103 auto tsv23_z = "a\tb\n"; 1104 auto tsv24_z = "\t\n"; 1105 auto tsv25_z = "#\n"; 1106 auto tsv26_z = "^\n"; 1107 auto tsv27_z = "#^#\n"; 1108 auto tsv28_z = "^#^\n"; 1109 auto tsv29_z = "$\n"; 1110 auto tsv30_z = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n"; 1111 auto tsv31_z = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n"; 1112 auto tsv32_z = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n"; 1113 auto tsv33_z = "\nX\nX\n\nX\n"; 1114 auto tsv34_z = "<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n"; 1115 auto tsv35_z = "abc\tdef\nghi\tjkl\nmno\tpqr\n"; 1116 1117 /* Aggregate the test data into parallel arrays. 
*/ 1118 auto csvSet1a = [csv1a, csv2a, csv3a, csv4a, csv5a, csv6a, csv7a, csv8a, csv9a, csv10a, 1119 csv11a, csv12a, csv13a, csv14a, csv15a, csv16a, csv17a, csv18a, csv19a, csv20a, 1120 csv21a, csv22a, csv23a, csv24a, csv25a, csv26a, csv27a, csv28a, csv29a, csv30a, 1121 csv31a, csv32a, csv33a, csv34a, csv35a]; 1122 1123 auto csvSet1b = [csv1b, csv2b, csv3b, csv4b, csv5b, csv6b, csv7b, csv8b, csv9b, csv10b, 1124 csv11b, csv12b, csv13b, csv14b, csv15b, csv16b, csv17b, csv18b, csv19b, csv20b, 1125 csv21b, csv22b, csv23b, csv24b, csv25b, csv26b, csv27b, csv28b, csv29b, csv30b, 1126 csv31b, csv32b, csv33b, csv34b, csv35b]; 1127 1128 auto tsvSet1 = [tsv1, tsv2, tsv3, tsv4, tsv5, tsv6, tsv7, tsv8, tsv9, tsv10, 1129 tsv11, tsv12, tsv13, tsv14, tsv15, tsv16, tsv17, tsv18, tsv19, tsv20, 1130 tsv21, tsv22, tsv23, tsv24, tsv25, tsv26, tsv27, tsv28, tsv29, tsv30, 1131 tsv31, tsv32, tsv33, tsv34, tsv35]; 1132 1133 auto tsvSet1_x = [tsv1_x, tsv2_x, tsv3_x, tsv4_x, tsv5_x, tsv6_x, tsv7_x, tsv8_x, tsv9_x, tsv10_x, 1134 tsv11_x, tsv12_x, tsv13_x, tsv14_x, tsv15_x, tsv16_x, tsv17_x, tsv18_x, tsv19_x, tsv20_x, 1135 tsv21_x, tsv22_x, tsv23_x, tsv24_x, tsv25_x, tsv26_x, tsv27_x, tsv28_x, tsv29_x, tsv30_x, 1136 tsv31_x, tsv32_x, tsv33_x, tsv34_x, tsv35_x]; 1137 1138 auto tsvSet1_y = [tsv1_y, tsv2_y, tsv3_y, tsv4_y, tsv5_y, tsv6_y, tsv7_y, tsv8_y, tsv9_y, tsv10_y, 1139 tsv11_y, tsv12_y, tsv13_y, tsv14_y, tsv15_y, tsv16_y, tsv17_y, tsv18_y, tsv19_y, tsv20_y, 1140 tsv21_y, tsv22_y, tsv23_y, tsv24_y, tsv25_y, tsv26_y, tsv27_y, tsv28_y, tsv29_y, tsv30_y, 1141 tsv31_y, tsv32_y, tsv33_y, tsv34_y, tsv35_y]; 1142 1143 auto tsvSet1_z = [tsv1_z, tsv2_z, tsv3_z, tsv4_z, tsv5_z, tsv6_z, tsv7_z, tsv8_z, tsv9_z, tsv10_z, 1144 tsv11_z, tsv12_z, tsv13_z, tsv14_z, tsv15_z, tsv16_z, tsv17_z, tsv18_z, tsv19_z, tsv20_z, 1145 tsv21_z, tsv22_z, tsv23_z, tsv24_z, tsv25_z, tsv26_z, tsv27_z, tsv28_z, tsv29_z, tsv30_z, 1146 tsv31_z, tsv32_z, tsv33_z, tsv34_z, tsv35_z]; 1147 1148 /* The tests. 
*/ 1149 auto bufferSizeTests = [1, 2, 3, 8, 128]; 1150 1151 foreach (bufferSize; bufferSizeTests) 1152 { 1153 ubyte[] readBuffer = new ubyte[](bufferSize); 1154 1155 foreach (i, csva, csvb, tsv, tsv_x, tsv_y, tsv_z; lockstep(csvSet1a, csvSet1b, tsvSet1, tsvSet1_x, tsvSet1_y, tsvSet1_z)) 1156 { 1157 import std.conv : to; 1158 1159 /* Byte streams for csv2tsv. Consumed by csv2tsv, so need to be reset when re-used. */ 1160 ubyte[] csvInputA = cast(ubyte[])csva; 1161 ubyte[] csvInputB = cast(ubyte[])csvb; 1162 1163 /* CSV Set A vs TSV expected. */ 1164 auto tsvResultA = appender!(char[])(); 1165 csv2tsv(csvInputA, tsvResultA, readBuffer, "csvInputA_defaultTSV"); 1166 assert(tsv == tsvResultA.data, 1167 format("Unittest failure. tsv != tsvResultA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1168 i + 1, csva, tsv, tsvResultA.data)); 1169 1170 /* CSV Set B vs TSV expected. Different CSV delimiters, same TSV results as CSV Set A.*/ 1171 auto tsvResultB = appender!(char[])(); 1172 csv2tsv(csvInputB, tsvResultB, readBuffer, "csvInputB_defaultTSV", 0, '#', '^'); 1173 assert(tsv == tsvResultB.data, 1174 format("Unittest failure. tsv != tsvResultB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1175 i + 1, csvb, tsv, tsvResultB.data)); 1176 1177 /* CSV Set A and TSV with $ separator.*/ 1178 csvInputA = cast(ubyte[])csva; 1179 auto tsvResult_XA = appender!(char[])(); 1180 csv2tsv(csvInputA, tsvResult_XA, readBuffer, "csvInputA_TSV_WithDollarDelimiter", 0, '"', ',', '$'); 1181 assert(tsv_x == tsvResult_XA.data, 1182 format("Unittest failure. tsv_x != tsvResult_XA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1183 i + 1, csva, tsv_x, tsvResult_XA.data)); 1184 1185 /* CSV Set B and TSV with $ separator. 
Same TSV results as CSV Set A.*/ 1186 csvInputB = cast(ubyte[])csvb; 1187 auto tsvResult_XB = appender!(char[])(); 1188 csv2tsv(csvInputB, tsvResult_XB, readBuffer, "csvInputB__TSV_WithDollarDelimiter", 0, '#', '^', '$'); 1189 assert(tsv_x == tsvResult_XB.data, 1190 format("Unittest failure. tsv_x != tsvResult_XB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1191 i + 1, csvb, tsv_x, tsvResult_XB.data)); 1192 1193 /* CSV Set A and TSV with $ separator and tsv delimiter/newline replacement. */ 1194 csvInputA = cast(ubyte[])csva; 1195 auto tsvResult_YA = appender!(char[])(); 1196 csv2tsv(csvInputA, tsvResult_YA, readBuffer, "csvInputA_TSV_WithDollarAndDelimReplacement", 0, '"', ',', '$', "|--|", "|--|"); 1197 assert(tsv_y == tsvResult_YA.data, 1198 format("Unittest failure. tsv_y != tsvResult_YA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1199 i + 1, csva, tsv_y, tsvResult_YA.data)); 1200 1201 /* CSV Set B and TSV with $ separator and tsv delimiter/newline replacement. Same TSV as CSV Set A.*/ 1202 csvInputB = cast(ubyte[])csvb; 1203 auto tsvResult_YB = appender!(char[])(); 1204 csv2tsv(csvInputB, tsvResult_YB, readBuffer, "csvInputB__TSV_WithDollarAndDelimReplacement", 0, '#', '^', '$', "|--|", "|--|"); 1205 assert(tsv_y == tsvResult_YB.data, 1206 format("Unittest failure. tsv_y != tsvResult_YB.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1207 i + 1, csvb, tsv_y, tsvResult_YB.data)); 1208 1209 /* CSV Set A and TSV with TAB replacement as <TAB> and newline replacement as <NL>. Same TSV as CSV Set A.*/ 1210 csvInputA = cast(ubyte[])csva; 1211 auto tsvResult_ZA = appender!(char[])(); 1212 csv2tsv(csvInputA, tsvResult_ZA, readBuffer, "csvInputA_TSV_WithDifferentTABandNLReplacements", 0, '"', ',', '\t', "<TAB>", "<NL>"); 1213 assert(tsv_z == tsvResult_ZA.data, 1214 format("Unittest failure. tsv_z != tsvResult_ZA.data. 
Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1215 i + 1, csva, tsv_z, tsvResult_ZA.data)); 1216 } 1217 } 1218 } 1219 1220 // csv2tsv skiplines tests 1221 unittest 1222 { 1223 import std.string : representation; 1224 1225 auto csv1 = ""; 1226 auto csv2 = "a"; 1227 1228 auto csv3 = "\n"; 1229 auto csv4 = "\n\n"; 1230 auto csv5 = "\n\n\n"; 1231 1232 auto csv6 = "a\n"; 1233 auto csv7 = "a\nb\n"; 1234 auto csv8 = "a\nb\nc\n"; 1235 1236 auto csv9 = "\"\n\"\n"; 1237 auto csv10 = "\"\n\"\n\"\n\"\n"; 1238 auto csv11 = "\"\n\"\n\"\n\"\n\"\n\"\n"; 1239 1240 auto csv12 = "\r"; 1241 auto csv13 = "\r\r"; 1242 auto csv14 = "\r\r\r"; 1243 1244 auto csv15 = "a\r"; 1245 auto csv16 = "a\rb\r"; 1246 auto csv17 = "a\rb\rc\r"; 1247 1248 auto csv18 = "\"\r\"\r"; 1249 auto csv19 = "\"\r\"\r\"\r\"\r"; 1250 auto csv20 = "\"\r\"\r\"\r\"\r\"\r\"\r"; 1251 1252 auto csv21 = "\r\n"; 1253 auto csv22 = "\r\n\r\n"; 1254 auto csv23 = "\r\n\r\n\r\n"; 1255 1256 auto csv24 = "a\r\n"; 1257 auto csv25 = "a\r\nb\r\n"; 1258 auto csv26 = "a\r\nb\r\nc\r\n"; 1259 1260 auto csv27 = "\"\r\n\"\r\n"; 1261 auto csv28 = "\"\r\n\"\r\n\"\r\n\"\r\n"; 1262 auto csv29 = "\"\r\n\"\r\n\"\r\n\"\r\n\"\r\n\"\r\n"; 1263 1264 /* The Skip 1 expected results. 
*/ 1265 auto tsv1Skip1 = ""; 1266 auto tsv2Skip1 = ""; 1267 1268 auto tsv3Skip1 = ""; 1269 auto tsv4Skip1 = "\n"; 1270 auto tsv5Skip1 = "\n\n"; 1271 1272 auto tsv6Skip1 = ""; 1273 auto tsv7Skip1 = "b\n"; 1274 auto tsv8Skip1 = "b\nc\n"; 1275 1276 auto tsv9Skip1 = ""; 1277 auto tsv10Skip1 = " \n"; 1278 auto tsv11Skip1 = " \n \n"; 1279 1280 auto tsv12Skip1 = ""; 1281 auto tsv13Skip1 = "\n"; 1282 auto tsv14Skip1 = "\n\n"; 1283 1284 auto tsv15Skip1 = ""; 1285 auto tsv16Skip1 = "b\n"; 1286 auto tsv17Skip1 = "b\nc\n"; 1287 1288 auto tsv18Skip1 = ""; 1289 auto tsv19Skip1 = " \n"; 1290 auto tsv20Skip1 = " \n \n"; 1291 1292 auto tsv21Skip1 = ""; 1293 auto tsv22Skip1 = "\n"; 1294 auto tsv23Skip1 = "\n\n"; 1295 1296 auto tsv24Skip1 = ""; 1297 auto tsv25Skip1 = "b\n"; 1298 auto tsv26Skip1 = "b\nc\n"; 1299 1300 auto tsv27Skip1 = ""; 1301 auto tsv28Skip1 = " \n"; 1302 auto tsv29Skip1 = " \n \n"; 1303 1304 /* The Skip 2 expected results. */ 1305 auto tsv1Skip2 = ""; 1306 auto tsv2Skip2 = ""; 1307 1308 auto tsv3Skip2 = ""; 1309 auto tsv4Skip2 = ""; 1310 auto tsv5Skip2 = "\n"; 1311 1312 auto tsv6Skip2 = ""; 1313 auto tsv7Skip2 = ""; 1314 auto tsv8Skip2 = "c\n"; 1315 1316 auto tsv9Skip2 = ""; 1317 auto tsv10Skip2 = ""; 1318 auto tsv11Skip2 = " \n"; 1319 1320 auto tsv12Skip2 = ""; 1321 auto tsv13Skip2 = ""; 1322 auto tsv14Skip2 = "\n"; 1323 1324 auto tsv15Skip2 = ""; 1325 auto tsv16Skip2 = ""; 1326 auto tsv17Skip2 = "c\n"; 1327 1328 auto tsv18Skip2 = ""; 1329 auto tsv19Skip2 = ""; 1330 auto tsv20Skip2 = " \n"; 1331 1332 auto tsv21Skip2 = ""; 1333 auto tsv22Skip2 = ""; 1334 auto tsv23Skip2 = "\n"; 1335 1336 auto tsv24Skip2 = ""; 1337 auto tsv25Skip2 = ""; 1338 auto tsv26Skip2 = "c\n"; 1339 1340 auto tsv27Skip2 = ""; 1341 auto tsv28Skip2 = ""; 1342 auto tsv29Skip2 = " \n"; 1343 1344 auto csvSet = 1345 [csv1, csv2, csv3, csv4, csv5, csv6, csv7, csv8, csv9, csv10, 1346 csv11, csv12, csv13, csv14, csv15, csv16, csv17, csv18, csv19, csv20, 1347 csv21, csv22, csv23, csv24, csv25, csv26, 
csv27, csv28, csv29]; 1348 1349 auto tsvSkip1Set = 1350 [tsv1Skip1, tsv2Skip1, tsv3Skip1, tsv4Skip1, tsv5Skip1, tsv6Skip1, tsv7Skip1, tsv8Skip1, tsv9Skip1, tsv10Skip1, 1351 tsv11Skip1, tsv12Skip1, tsv13Skip1, tsv14Skip1, tsv15Skip1, tsv16Skip1, tsv17Skip1, tsv18Skip1, tsv19Skip1, tsv20Skip1, 1352 tsv21Skip1, tsv22Skip1, tsv23Skip1, tsv24Skip1, tsv25Skip1, tsv26Skip1, tsv27Skip1, tsv28Skip1, tsv29Skip1]; 1353 1354 auto tsvSkip2Set = 1355 [tsv1Skip2, tsv2Skip2, tsv3Skip2, tsv4Skip2, tsv5Skip2, tsv6Skip2, tsv7Skip2, tsv8Skip2, tsv9Skip2, tsv10Skip2, 1356 tsv11Skip2, tsv12Skip2, tsv13Skip2, tsv14Skip2, tsv15Skip2, tsv16Skip2, tsv17Skip2, tsv18Skip2, tsv19Skip2, tsv20Skip2, 1357 tsv21Skip2, tsv22Skip2, tsv23Skip2, tsv24Skip2, tsv25Skip2, tsv26Skip2, tsv27Skip2, tsv28Skip2, tsv29Skip2]; 1358 1359 auto bufferSizeTests = [1, 2, 3, 4, 8, 128]; 1360 1361 foreach (bufferSize; bufferSizeTests) 1362 { 1363 ubyte[] readBuffer = new ubyte[](bufferSize); 1364 1365 foreach (i, csv, tsvSkip1, tsvSkip2; lockstep(csvSet, tsvSkip1Set, tsvSkip2Set)) 1366 { 1367 ubyte[] csvInput = csv.dup.representation; 1368 auto csvToTSVSkip1 = appender!(char[])(); 1369 auto csvToTSVSkip2 = appender!(char[])(); 1370 1371 csv2tsv(csvInput, csvToTSVSkip1, readBuffer, "csvToTSVSkip1", 1); 1372 1373 assert(tsvSkip1 == csvToTSVSkip1.data, 1374 format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1375 i + 1, bufferSize, csv, tsvSkip1, csvToTSVSkip1.data)); 1376 1377 csv2tsv(csvInput, csvToTSVSkip2, readBuffer, "csvToTSVSkip2", 2); 1378 1379 assert(tsvSkip2 == csvToTSVSkip2.data, 1380 format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n", 1381 i + 1, bufferSize, csv, tsvSkip2, csvToTSVSkip2.data)); 1382 } 1383 } 1384 } 1385 1386 // csv2tsv BOM tests. 
// Note: std.range.lockstep prevents use of @safe
unittest
{
    import std.conv : hexString;
    import std.string : representation;

    /* UTF-8 Byte Order Mark: EF BB BF. Prepended to the plain inputs/outputs to
     * build the BOM variants of each test case.
     */
    enum utf8BOM = hexString!"efbbbf";

    /* CSV inputs without a BOM. */
    auto csv1 = "";
    auto csv2 = "a";
    auto csv3 = "ab";
    auto csv4 = "a,b";
    auto csv5 = "a,b\ncdef,ghi\njklmn,opqrs\ntuv,wxyz";

    /* The same inputs with a leading UTF-8 BOM. */
    auto csv1BOM = utf8BOM ~ csv1;
    auto csv2BOM = utf8BOM ~ csv2;
    auto csv3BOM = utf8BOM ~ csv3;
    auto csv4BOM = utf8BOM ~ csv4;
    auto csv5BOM = utf8BOM ~ csv5;

    /* Expected TSV output when the input has no BOM (or the BOM was removed). */
    auto tsv1 = "";
    auto tsv2 = "a\n";
    auto tsv3 = "ab\n";
    auto tsv4 = "a\tb\n";
    auto tsv5 = "a\tb\ncdef\tghi\njklmn\topqrs\ntuv\twxyz\n";

    /* Note: csv1 is the empty string, so tsv1 does not have a trailing newline.
     * However, with the BOM prepended the tsv gets a trailing newline.
     */
    auto tsv1BOM = utf8BOM ~ tsv1 ~ "\n";
    auto tsv2BOM = utf8BOM ~ tsv2;
    auto tsv3BOM = utf8BOM ~ tsv3;
    auto tsv4BOM = utf8BOM ~ tsv4;
    auto tsv5BOM = utf8BOM ~ tsv5;

    auto csvSet = [csv1, csv2, csv3, csv4, csv5];
    auto csvBOMSet = [csv1BOM, csv2BOM, csv3BOM, csv4BOM, csv5BOM];

    auto tsvSet = [tsv1, tsv2, tsv3, tsv4, tsv5];
    auto tsvBOMSet = [tsv1BOM, tsv2BOM, tsv3BOM, tsv4BOM, tsv5BOM];

    /* Buffer sizes 1-2 are smaller than the 3-byte BOM; these exercise the
     * "buffer too small to detect the BOM" path below.
     */
    auto bufferSizeTests = [1, 2, 3, 4, 8, 128];

    foreach (bufferSize; bufferSizeTests)
    {
        ubyte[] readBuffer = new ubyte[](bufferSize);

        foreach (i, csv, csvBOM, tsv, tsvBOM; lockstep(csvSet, csvBOMSet, tsvSet, tsvBOMSet))
        {
            ubyte[] csvInput = csv.dup.representation;
            ubyte[] csvBOMInput = csvBOM.dup.representation;

            auto csvToTSV = appender!(char[])();
            auto csvToTSV_NoBOMRemoval = appender!(char[])();
            auto csvBOMToTSV = appender!(char[])();
            auto csvBOMToTSV_NoBOMRemoval = appender!(char[])();

            /* BOM-free input: result is the same whether BOM removal is on or off. */
            csv2tsv(csvInput, csvToTSV, readBuffer, "csvToTSV", 0, '"', ',', '\t', " ", " ", true);
            assert(tsv == csvToTSV.data,
                   format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csv, tsv, csvToTSV.data));

            csv2tsv(csvInput, csvToTSV_NoBOMRemoval, readBuffer, "csvToTSV_NoBOMRemoval", 0, '"', ',', '\t', " ", " ", false);
            assert(tsv == csvToTSV_NoBOMRemoval.data,
                   format("Unittest failure. tsv != csvToTSV_NoBOMRemoval.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csv, tsv, csvToTSV_NoBOMRemoval.data));

            /* BOM input with removal enabled. Removal only happens when the read
             * buffer is large enough to hold the full BOM.
             */
            csv2tsv(csvBOMInput, csvBOMToTSV, readBuffer, "csvBOMToTSV", 0, '"', ',', '\t', " ", " ", true);
            if (readBuffer.length < utf8BOM.length)
            {
                /* Removing BOMs, but didn't provide enough buffer, so no removal. */
                assert(tsvBOM == csvBOMToTSV.data,
                       format("Unittest failure. tsvBOM != csvBOMToTSV.data. (Small buffer) Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                              i + 1, bufferSize, csvBOM, tsvBOM, csvBOMToTSV.data));
            }
            else
            {
                assert(tsv == csvBOMToTSV.data,
                       format("Unittest failure. tsv != csvBOMToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                              i + 1, bufferSize, csvBOM, tsv, csvBOMToTSV.data));
            }

            /* BOM input with removal disabled: the BOM passes through untouched. */
            csv2tsv(csvBOMInput, csvBOMToTSV_NoBOMRemoval, readBuffer, "csvBOMToTSV_NoBOMRemoval", 0, '"', ',', '\t', " ", " ", false);
            assert(tsvBOM == csvBOMToTSV_NoBOMRemoval.data,
                   format("Unittest failure. tsvBOM != csvBOMToTSV_NoBOMRemoval.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csvBOM, tsvBOM, csvBOMToTSV_NoBOMRemoval.data));
        }
    }
}