1 /**
2 Convert CSV formatted data to TSV format.
4 This program converts comma-separated value data to tab-separated format.
6 Copyright (c) 2016-2021, eBay Inc.
7 Initially written by Jon Degenhardt
9 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
10 */
12 module tsv_utils.csv2tsv;
14 import std.stdio;
15 import std.exception : enforce;
16 import std.format : format;
17 import std.range;
18 import std.traits : isArray, Unqual;
19 import std.typecons : tuple;
20 import tsv_utils.common.utils : isBufferableInputSource, inputSourceByChunk;
/** Brief help text. Csv2tsvOptions.processArgs passes it to defaultGetoptPrinter,
 * ahead of the generated option list, when getopt reports that help was requested.
 */
immutable helpText = q"EOS
Synopsis: csv2tsv [options] [file...]
csv2tsv converts comma-separated text (CSV) to tab-separated format (TSV). Records
are read from files or standard input, converted records written to standard output.
Use '--help-verbose' for details on the CSV formats accepted.
Options:
EOS";
/** Detailed help text. Csv2tsvOptions.processArgs passes it to defaultGetoptPrinter,
 * ahead of the generated option list, when the `--help-verbose` option is set.
 */
immutable helpTextVerbose = q"EOS
Synopsis: csv2tsv [options] [file...]
csv2tsv converts CSV (comma-separated) text to TSV (tab-separated) format. Records
are read from files or standard input, converted records written to standard output.
Both formats represent tabular data, each record on its own line, fields separated
by a delimiter character. The key difference is that CSV uses escape sequences to
represent newlines and field separators in the data, whereas TSV disallows these
characters in the data. The most common field delimiters are comma for CSV and tab
for TSV, but any character can be used.
Conversion to TSV is done by removing CSV escape syntax, changing field delimiters,
and replacing newlines and field delimiters in the data. By default, newlines and
field delimiters in the data are replaced by spaces. Most details are customizable.
There is no single spec for CSV, any number of variants can be found. The escape
syntax is common enough: fields containing newlines or field delimiters are placed
in double quotes. Inside a quoted field, a double quote is represented by a pair of
double quotes. As with field separators, the quoting character is customizable.
Behaviors of this program that often vary between CSV implementations:
  * Newlines are supported in quoted fields.
  * Double quotes are permitted in a non-quoted field. However, a field starting
    with a quote must follow quoting rules.
  * Each record can have a different number of fields.
  * The three common forms of newlines are supported: CR, CRLF, LF. Output is
    written using Unix newlines (LF).
  * A newline will be added if the file does not end with one.
  * A UTF-8 Byte Order Mark (BOM) at the start of a file will be removed.
  * No whitespace trimming is done.
This program does not validate CSV correctness, but will terminate with an error
upon reaching an inconsistent state. Improperly terminated quoted fields are the
primary cause.
UTF-8 input is assumed. Convert other encodings prior to invoking this tool.
Options:
EOS";
/** Command line options for csv2tsv, together with the logic that parses and
 * validates them.
 */
struct Csv2tsvOptions
{
    string programName;
    bool helpVerbose = false;          // --help-verbose
    bool hasHeader = false;            // --H|header
    char csvQuoteChar = '"';           // --q|quote
    char csvDelimChar = ',';           // --c|csv-delim
    char tsvDelimChar = '\t';          // --t|tsv-delim
    string tsvDelimReplacement = " ";  // --r|tab-replacement
    string newlineReplacement = " ";   // --n|newline-replacement
    bool versionWanted = false;        // --V|version

    /** Parses the command line, fills in the option fields, and validates them.
     *
     * Params:
     *   cmdArgs = The program arguments. Passed by reference to getopt, which
     *             processes the options it recognizes.
     *
     * Returns: A (bool, int) tuple. The bool is true when the caller should go on
     *   to process input. When it is false the int is the exit code to use: 0 after
     *   printing help or version text, 1 after reporting an argument error on stderr.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : canFind;
        import std.getopt;
        import std.path : baseName, stripExtension;

        /* True for either of the characters used in newline sequences. */
        static bool isNewlineChar(const char c)
        {
            return c == '\n' || c == '\r';
        }

        if (cmdArgs.length > 0)
        {
            programName = cmdArgs[0].stripExtension.baseName;
        }
        else
        {
            programName = "Unknown_program_name";
        }

        try
        {
            auto parsed = getopt(
                cmdArgs,
                "help-verbose",          "     Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",              "     Treat the first line of each file as a header. Only the header of the first file is output.", &hasHeader,
                std.getopt.config.caseSensitive,
                "q|quote",               "CHR  Quoting character in CSV data. Default: double-quote (\")", &csvQuoteChar,
                "c|csv-delim",           "CHR  Field delimiter in CSV data. Default: comma (,).", &csvDelimChar,
                "t|tsv-delim",           "CHR  Field delimiter in TSV data. Default: TAB", &tsvDelimChar,
                "r|tab-replacement",     "STR  Replacement for TSV field delimiters (typically TABs) found in CSV input. Default: Space.", &tsvDelimReplacement,
                "n|newline-replacement", "STR  Replacement for newlines found in CSV input. Default: Space.", &newlineReplacement,
                std.getopt.config.caseSensitive,
                "V|version",             "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            /* Informational requests: print and tell the caller to stop with success. */
            if (parsed.helpWanted)
            {
                defaultGetoptPrinter(helpText, parsed.options);
                return tuple(false, 0);
            }

            if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, parsed.options);
                return tuple(false, 0);
            }

            if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("csv2tsv"));
                return tuple(false, 0);
            }

            /* Validate that the delimiter, quote, and replacement settings do not
             * conflict. The first failing check determines the error reported.
             */
            enforce(!isNewlineChar(csvQuoteChar),
                    "CSV quote character cannot be newline (--q|quote).");

            enforce(csvQuoteChar != csvDelimChar,
                    "CSV quote and CSV field delimiter characters must be different (--q|quote, --c|csv-delim).");

            enforce(csvQuoteChar != tsvDelimChar,
                    "CSV quote and TSV field delimiter characters must be different (--q|quote, --t|tsv-delim).");

            enforce(!isNewlineChar(csvDelimChar),
                    "CSV field delimiter cannot be newline (--c|csv-delim).");

            enforce(!isNewlineChar(tsvDelimChar),
                    "TSV field delimiter cannot be newline (--t|tsv-delim).");

            enforce(!tsvDelimReplacement.canFind!(c => c == '\n' || c == '\r' || c == tsvDelimChar),
                    "Replacement character cannot contain newlines or TSV field delimiters (--r|tab-replacement).");

            enforce(!newlineReplacement.canFind!(c => c == '\n' || c == '\r' || c == tsvDelimChar),
                    "Replacement character cannot contain newlines or TSV field delimiters (--n|newline-replacement).");
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }

        return tuple(true, 0);
    }
}
/* D runtime options embedded in the program, declared only when the compiler
 * front-end version is 2085 or later. The "gcopt=cleanup:none" setting is a garbage
 * collector option; NOTE(review): per the D GC configuration documentation it
 * disables the collection normally run at program termination - confirm against
 * the druntime version in use.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
version(unittest)
{
    // main is not compiled into unittest builds.
}
else
{
    /** Program entry point.
     *
     * Parses the command line and converts the listed files via csv2tsvFiles.
     *
     * Returns: The exit code from argument processing when it says to stop,
     *   1 if the conversion throws an Exception, 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* Merge coverage reports when built by DMD in code coverage mode. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        Csv2tsvOptions options;
        const argsStatus = options.processArgs(cmdArgs);
        immutable bool keepGoing = argsStatus[0];
        if (!keepGoing)
        {
            return argsStatus[1];
        }

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        int exitCode = 0;
        try
        {
            /* The first element of cmdArgs is not an input file, so it is dropped. */
            csv2tsvFiles(options, cmdArgs[1..$]);
        }
        catch (Exception exc)
        {
            /* Write a newline to stdout and flush it before the error goes to stderr. */
            writeln();
            stdout.flush();
            stderr.writefln("Error [%s]: %s", options.programName, exc.msg);
            exitCode = 1;
        }

        return exitCode;
    }
}
/** Converts each input file in turn by handing it to csv2tsv, which works on a
 * single input. This routine owns the read buffer and the BufferedOutputRange
 * wrapped around standard output, and decides whether a header line is skipped.
 *
 * Params:
 *   cmdopt     = The parsed command line options.
 *   inputFiles = Files to convert. "-" means standard input, which is also used
 *                when the list is empty.
 */
void csv2tsvFiles(const ref Csv2tsvOptions cmdopt, const string[] inputFiles)
{
    import tsv_utils.common.utils : BufferedOutputRange;

    /* Buffer sizes
     *
     * ReadBufferSize is the size typically used for buffered reads by tsv-utils
     * programs. The BufferedOutputRange defaults are overridden, though: the
     * initial allocation (reserve size) is larger and the max size is lowered so
     * that data reaches standard output sooner.
     *
     * BufferedOutputRange is designed mainly for record oriented writes, where
     * each write ends in a newline. Given a string ending in a newline, it flushes
     * once the buffer is larger than 'flush size'. Otherwise a flush happens only
     * after 'max size' is exceeded.
     *
     * The buffered conversion algorithm in csv2tsv sees two quite different inputs:
     * 1) Heavy use of CSV escapes, with all fields quoted.
     * 2) Light use of CSV escapes, with few fields quoted.
     *
     * Case 1 ends up as record oriented writes. A quoted field pushes accumulated
     * data to BufferedOutputRange, so when the first field of a record is quoted
     * the write lands on a newline boundary. The default flush behavior is a good
     * fit here.
     *
     * In case 2 data is pushed on arbitrary byte boundaries, and nothing is flushed
     * to standard output until max size bytes have accumulated. The default max
     * size is larger than optimal, so max size is set close to the read buffer
     * size. The reserve size is raised for the same reason.
     */
    enum ReadBufferSize = 1024L * 128L;
    enum OutputBufferFlushSize = 1024L * 10L;
    enum OutputBufferReserveSize = 1024L * 129L;
    enum OutputBufferMaxSize = 1024L * 128L;

    ubyte[ReadBufferSize] fileRawBuf;
    auto stdoutWriter = BufferedOutputRange!(typeof(stdout))(
        stdout, OutputBufferFlushSize, OutputBufferReserveSize, OutputBufferMaxSize);

    const sources = (inputFiles.length > 0) ? inputFiles : ["-"];

    foreach (fileIndex, filename; sources)
    {
        immutable bool isStdin = (filename == "-");
        auto inputStream = isStdin ? stdin : filename.File;
        auto printFileName = isStdin ? "stdin" : filename;

        /* With headers enabled, the first line of every file after the first one
         * is skipped, so only the first file's header is written.
         */
        immutable size_t headerLinesToSkip = (fileIndex > 0 && cmdopt.hasHeader) ? 1 : 0;

        csv2tsv(inputStream, stdoutWriter, fileRawBuf, printFileName, headerLinesToSkip,
                cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement,
                cmdopt.newlineReplacement);
    }
}
261 /* csv2tsv buffered conversion algorithm
263 This version of csv2tsv uses a buffered approach to csv-to-tsv conversion. This is a
264 change from the original version, which used a character-at-a-time approach, with
265 characters coming from an infinite stream of characters. The character-at-a-time
266 approach was nice from a simplicity perspective, but the approach didn't optimize well.
267 Note that the original version read input in blocks and wrote to stdout in blocks, it
268 was the conversion algorithm itself that was character oriented.
270 The idea is to convert a buffer at a time, writing larger blocks to the output stream
271 rather than one character at a time. In addition, the read buffer is modified in-place
272 when the only change is to convert a single character. The notable case is converting
273 the field delimiter character, typically comma to TAB. The result is writing longer
274 blocks to the output stream (BufferedOutputRange).
276 Performance improvements from the new algorithm are notable. This is especially true
277 versus the previous version 2.0.0. Note though that the more recent versions of
278 csv2tsv were slower due to degradations coming from compiler and/or language version.
279 Version 1.1.19 was quite a bit faster. Regardless of version, the performance
280 improvement is especially good when run against "simple" CSV files, with limited
281 amounts of CSV escape syntax. In these files the main change is converting the field
282 delimiter character, typically comma to TAB.
284 In some benchmarks on Mac OS, the new version was 40% faster than csv2tsv 2.0.0 on
285 files with significant CSV escapes, and 60% faster on files with limited CSV escapes.
286 Versus csv2tsv version 1.1.19, the new version is 10% and 40% faster on the same
287 files. On the "simple CSV" file, where Unix 'tr' is an option, 'tr' was still faster,
288 by about 20%. But getting into the 'tr' ballpark while retaining safety of correct
289 csv2tsv conversion is a good result.
291 Algorithm notes:
293 The algorithm works by reading an input block, then examining each byte in-order to
294 identify needed modifications. The region of consecutive characters without a change
295 is tracked. Single character changes are done in-place, in the read buffer. This
296 allows assembling longer blocks before write is needed. The region being tracked is
297 written to the output stream when it can no longer be extended in a continuous
298 fashion. At this point a new region is started. When the current read buffer has
299 been processed the current region is written out and a new block of data read in.
301 The read buffer uses fixed size blocks. This means the algorithm is actually
302 operating on bytes (UTF-8 code units), and not characters. This works because all
303 delimiters and CSV escape syntax characters are single byte UTF-8 characters. These
304 are the only characters requiring interpretation. The main nuisance is the 2-byte
305 CRLF newline sequence, as this might be split across two read buffers. This is
306 handled by embedding 'CR' states in the finite state machine.
Processing CSV escapes will often cause character removals and additions. These
cannot be represented in a continuous stream of bytes without moving bytes around.
Instead of moving bytes, these cases are handled by immediately writing to the output
stream. This allows restarting a new block of contiguous characters. Handling by the
new algorithm is described below. Note that the length of the replacement characters
for TSV field and record delimiters (e.g. TAB, newline) affects the processing.
315 All replacement character lengths:
317 * Windows newline (CRLF) at the end of a line - Replace the CRLF with LF.
319   Replace the CR with LF, add it to the current write region and terminate it. The
320   next write region starts at the character after the LF.
322 * Double quote starting or ending a field - Drop the double quote.
324   Terminate the current write region, next write region starts at the next character.
326 * Double quote pair inside a quoted field - Drop one of the double quotes.
  The algorithm drops the first double quote and keeps the second. This avoids
  look-ahead, and both the field terminating double quote and the double quote pair
  can be handled the same way. Terminate the current write region without adding the
  double quote. The next write region starts at the next character.
333 Single byte replacement characters:
335 * Windows newline (CRLF) in a quoted field
337   Replace the CR with the replacement char, add it to the current write region and
338   terminate it. The next write region starts at the character after the LF.
340 Multi-byte replacement sequences:
342 * TSV Delimiter (TAB by default) in a field
344   Terminate the current write region, write it out and the replacement. The next
345   write region starts at the next character.
347 * LF, CR, or CRLF in a quoted field
349   Terminate the current write region, write it and the replacement. The next write
350   region starts at the next character.
352 csv2tsv API
At the API level, it is desirable to handle both open files and input streams.
Open files are the key requirement, but handling input streams simplifies unit
testing, and in-memory conversion is likely to be useful anyway. Internally, it
should be easy enough to encapsulate the differences between input streams and files.
358 Reading files can be done using File.byChunk and reading from input streams can be
359 done using std.range.chunks.
361 This has been handled by creating a new range that can iterate either files or
362 input streams chunk-by-chunk.
363 */
/** Read CSV from an input source, convert to TSV and write to an output source.
 *
 * The input is processed one read buffer at a time by a finite state machine that
 * examines each byte. Single byte changes (e.g. CSV delimiter to TSV delimiter) are
 * made in-place in the read buffer. Changes that add or remove bytes end the current
 * "write region", which is written to the output stream before a new region starts.
 *
 * Params:
 *   inputSource           =  A "bufferable" input source, either a file open for
 *                            read or an input range with ubyte elements.
 *   outputStream          =  An output range to write TSV bytes to.
 *   readBuffer            =  A buffer to use for reading. Must have at least one
 *                            element. Its contents are modified during conversion.
 *   filename              =  Name of file to use when reporting errors. A descriptive
 *                            name can be used in lieu of a file name.
 *   skipLines             =  Number of lines to skip before outputting records.
 *                            Typically used to skip writing header lines.
 *   csvQuote              =  The quoting character used in the CSV input.
 *   csvDelim              =  The field delimiter character used in the CSV input.
 *   tsvDelim              =  The field delimiter character to use in the TSV output.
 *   tsvDelimReplacement   =  String to use when replacing TSV field delimiters
 *                            (e.g. TABs) found in the CSV data fields.
 *   tsvNewlineReplacement =  String to use when replacing newlines found in the CSV
 *                            data fields.
 *   discardBOM            =  If true (the default), a UTF-8 Byte Order Mark found at the
 *                            start of the input stream will be dropped.
 *
 * Throws: Exception on finding inconsistent CSV. This includes input that ends while
 *         a quoted field is still open, whether the last byte read was ordinary field
 *         data or a CR inside the quoted field. Exception text includes the filename
 *         and line number where the error was identified.
 */
void csv2tsv(InputSource, OutputRange)(
    InputSource inputSource,
    auto ref OutputRange outputStream,
    ubyte[] readBuffer,
    string filename = "(none)",
    size_t skipLines = 0,
    const char csvQuote = '"',
    const char csvDelim = ',',
    const char tsvDelim = '\t',
    const string tsvDelimReplacement = " ",
    const string tsvNewlineReplacement = " ",
    bool discardBOM = true,
)
if (isBufferableInputSource!InputSource &&
    isOutputRange!(OutputRange, char))
{
    import std.conv: hexString;

    assert (readBuffer.length >= 1);

    enum char LF = '\n';
    enum char CR = '\r';

    enum ubyte[3] UTF8_BOM = cast(ubyte[3])hexString!"efbbbf";

    /* Process state information - These variables are defined either in the outer
     * context or within one of the foreach loops.
     *
     *   * recordNum - The current CSV input line/record number. Starts at one.
     *   * fieldNum - Field number in the current line/record. Field numbers are
     *     one-upped. The field number is set to zero at the start of a new record,
     *     prior to processing the first character of the first field on the record.
     *   * csvState - The current state of CSV processing. In particular, the state
     *     of the finite state machine.
     *   * writeRegionStart - Read buffer index where the next write starts from.
     *   * nextIndex - The index of the current input ubyte being processed. The
     *     current write region extends from the writeRegionStart to nextIndex.
     *   * nextChar - The current input ubyte. The ubyte/char at nextIndex.
     */

    enum CSVState
    {
        FieldEnd,           // Start of input or after consuming a field or record delimiter.
        NonQuotedField,     // Processing a non-quoted field
        QuotedField,        // Processing a quoted field
        QuoteInQuotedField, // Last char was a quote in a quoted field
        CRAtFieldEnd,       // Last char was a CR terminating a record/line
        CRInQuotedField,    // Last char was a CR in a quoted field
    }

    CSVState csvState = CSVState.FieldEnd;
    size_t recordNum = 1;
    size_t fieldNum = 0;

    foreach (chunkIndex, inputChunkComplete; inputSource.inputSourceByChunk(readBuffer).enumerate)
    {
        size_t writeRegionStart = 0;

        /* Discard byte order marks at the start of input.
         * Note: Slicing the chunk in this fashion generates very good code, better
         * than other approaches like manipulating indices.
         */
        auto inputChunk =
            (discardBOM &&
             chunkIndex == 0 &&
             inputChunkComplete.length >= UTF8_BOM.length &&
             inputChunkComplete[0 .. UTF8_BOM.length] == UTF8_BOM
            )
            ? inputChunkComplete[UTF8_BOM.length .. $]
            : inputChunkComplete[];

        /* flushCurrentRegion flushes the current write region and moves the start of
         * the next write region one byte past the end of the current region. If
         * appendChars are provided they are output as well.
         *
         * This routine is called when the current character (byte) terminates the
         * current write region and should not itself be output. That is why the next
         * write region always starts one byte past the current region end.
         *
         * This routine is also called when the 'skiplines' region has been processed.
         * This is done to flush the region without actually writing it. This is done
         * by the 'nextRecord' routine defined in the foreach loop.
         */
        void flushCurrentRegion(size_t regionEnd, const char[] appendChars = "")
        {
            assert(regionEnd <= inputChunk.length);

            /* Nothing is written while still inside the lines being skipped. */
            if (recordNum > skipLines)
            {
                if (regionEnd > writeRegionStart)
                {
                    outputStream.put(inputChunk[writeRegionStart .. regionEnd]);
                }
                if (appendChars.length > 0)
                {
                    outputStream.put(appendChars);
                }
            }

            writeRegionStart = regionEnd + 1;
        }

        foreach (size_t nextIndex, char nextChar; inputChunk)
        {
            /* nextRecord is used when an end of record (end of line) is found. It
             * bumps the record number and resets the field number. It also flushes
             * the current write region if the line we were on was the last line
             * being skipped at the start of input. Normally the header line.
             */
            void nextRecord()
            {
                if (recordNum == skipLines) flushCurrentRegion(nextIndex);
                ++recordNum;
                fieldNum = 0;
            }

        OuterSwitch: final switch (csvState)
            {
            case CSVState.FieldEnd:
                /* Start of input or after consuming a field terminator. */
                ++fieldNum;

                /* Note: Can't use switch due to the 'goto case' to the OuterSwitch.  */
                if (nextChar == csvQuote)
                {
                    /* Opening quote: drop it by ending the write region here. */
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.QuotedField;
                    break OuterSwitch;
                }
                else
                {
                    /* Processing state change only. Don't consume the character. */
                    csvState = CSVState.NonQuotedField;
                    goto case CSVState.NonQuotedField;
                }

            case CSVState.NonQuotedField:
                switch (nextChar)
                {
                default:
                    break OuterSwitch;
                case csvDelim:
                    /* Single byte change, done in-place. */
                    inputChunk[nextIndex] = tsvDelim;
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case LF:
                    nextRecord();
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case CR:
                    /* CR becomes the output newline. A following LF is dropped by
                     * the CRAtFieldEnd state.
                     */
                    inputChunk[nextIndex] = LF;
                    nextRecord();
                    csvState = CSVState.CRAtFieldEnd;
                    break OuterSwitch;
                case tsvDelim:
                    if (tsvDelimReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvDelimReplacement[0];
                    }
                    else
                    {
                        flushCurrentRegion(nextIndex, tsvDelimReplacement);
                    }
                    break OuterSwitch;
                }

            case CSVState.QuotedField:
                switch (nextChar)
                {
                default:
                    break OuterSwitch;
                case csvQuote:
                    /*
                     * Flush the current region, without the double quote. Switch state
                     * to QuoteInQuotedField, which determines whether to output a quote.
                     */
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.QuoteInQuotedField;
                    break OuterSwitch;

                case tsvDelim:
                    if (tsvDelimReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvDelimReplacement[0];
                    }
                    else
                    {
                        flushCurrentRegion(nextIndex, tsvDelimReplacement);
                    }
                    break OuterSwitch;
                case LF:
                    /* Newline in a quoted field. */
                    if (tsvNewlineReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvNewlineReplacement[0];
                    }
                    else
                    {
                        flushCurrentRegion(nextIndex, tsvNewlineReplacement);
                    }
                    break OuterSwitch;
                case CR:
                    /* Carriage Return in a quoted field. The replacement is emitted
                     * for the CR. A following LF is dropped by CRInQuotedField.
                     */
                    if (tsvNewlineReplacement.length == 1)
                    {
                        inputChunk[nextIndex] = tsvNewlineReplacement[0];
                    }
                    else
                    {
                        flushCurrentRegion(nextIndex, tsvNewlineReplacement);
                    }
                    csvState = CSVState.CRInQuotedField;
                    break OuterSwitch;
                }

            case CSVState.QuoteInQuotedField:
                /* Just processed a quote in a quoted field. The buffer, without the
                 * quote, was just flushed. Only legal characters here are quote,
                 * comma (field delimiter), newline (record delimiter).
                 */
                switch (nextChar)
                {
                case csvQuote:
                    /* Second quote of a pair. It stays in the write region. */
                    csvState = CSVState.QuotedField;
                    break OuterSwitch;
                case csvDelim:
                    inputChunk[nextIndex] = tsvDelim;
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case LF:
                    nextRecord();
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                case CR:
                    inputChunk[nextIndex] = LF;
                    nextRecord();
                    csvState = CSVState.CRAtFieldEnd;
                    break OuterSwitch;
                default:
                    throw new Exception(
                        format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                               (filename == "-") ? "Standard Input" : filename,
                               recordNum));
                }

            case CSVState.CRInQuotedField:
                if (nextChar == LF)
                {
                    /* CRLF in a quoted field. The CR was already replaced, drop the LF. */
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.QuotedField;
                    break OuterSwitch;
                }
                else {
                    /* Naked CR. State change only, don't consume current character. */
                    csvState = CSVState.QuotedField;
                    goto case CSVState.QuotedField;
                }

            case CSVState.CRAtFieldEnd:
                if (nextChar == LF)
                {
                    /* CRLF ending a record. The CR was already changed to LF, drop the LF. */
                    flushCurrentRegion(nextIndex);
                    csvState = CSVState.FieldEnd;
                    break OuterSwitch;
                }
                else {
                    /* Naked CR. State change only, don't consume current character. */
                    csvState = CSVState.FieldEnd;
                    goto case CSVState.FieldEnd;
                }
            }
        }

        /* End of buffer. Write whatever remains of the current write region. */
        if (writeRegionStart < inputChunk.length && recordNum > skipLines)
        {
            outputStream.put(inputChunk[writeRegionStart .. $]);
        }

        writeRegionStart = 0;
    }

    /* End of input. A quoted field is still open if the state machine is in
     * QuotedField, or in CRInQuotedField (the last byte was a CR inside a quoted
     * field). Both are improperly terminated quoted fields. QuoteInQuotedField is
     * fine, the last byte was the closing quote.
     */
    enforce(csvState != CSVState.QuotedField && csvState != CSVState.CRInQuotedField,
            format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                   (filename == "-") ? "Standard Input" : filename,
                   recordNum));

    /* Output a newline if the CSV input did not have a terminating newline. */
    if (fieldNum > 0 && recordNum > skipLines) put(outputStream, '\n');
}
unittest
{
    /* Unit tests for the csv2tsv function.
     *
     * These unit tests exercise different CSV combinations and escaping cases. The CSV
     * data content is the same for each corresponding test string, except the delimiters
     * have been changed. e.g. csv6a and csv6b have the same data content.
     *
     * A property used in these tests is that changing the CSV delimiters doesn't change
     * the resulting TSV. However, changing the TSV delimiters will change the TSV result,
     * as TSV doesn't support having its delimiters in the data. This allows having a
     * single TSV expected set that is generated by CSVs with different delimiter sets.
     *
     * This test set does not test main, file handling, or error messages. These are
     * handled by tests run against the executable.
     *
     * Note: The input byte streams are built with std.string.representation applied to
     * a mutable copy of each test string (the same idiom used by the other csv2tsv unit
     * tests in this file) rather than by casting string to ubyte[], so immutable string
     * data is never aliased as mutable.
     */
    import std.string : representation;

    /* Default CSV. */
    auto csv1a = "a,b,c";
    auto csv2a = "a,bc,,,def";
    auto csv3a = ",a, b , cd ,";
    auto csv4a = "ß,ßÀß,あめりか物語,书名: 五色石";
    auto csv5a = "\"\n\",\"\n\n\",\"\n\n\n\"";
    auto csv6a = "\"\t\",\"\t\t\",\"\t\t\t\"";
    auto csv7a = "\",\",\",,\",\",,,\"";
    auto csv8a = "\"\",\"\"\"\",\"\"\"\"\"\"";
    auto csv9a = "\"ab, de\tfg\"\"\nhij\"";
    auto csv10a = "";
    auto csv11a = ",";
    auto csv12a = ",,";
    auto csv13a = "\"\r\",\"\r\r\",\"\r\r\r\"";
    auto csv14a = "\"\r\n\",\"\r\n\r\n\",\"\r\n\r\n\r\n\"";
    auto csv15a = "\"ab, de\tfg\"\"\rhij\"";
    auto csv16a = "\"ab, de\tfg\"\"\r\nhij\"";
    auto csv17a = "ab\",ab\"cd";
    auto csv18a = "\n\n\n";
    auto csv19a = "\t";
    auto csv20a = "\t\t";
    auto csv21a = "a\n";
    auto csv22a = "a,\n";
    auto csv23a = "a,b\n";
    auto csv24a = ",\n";
    auto csv25a = "#";
    auto csv26a = "^";
    auto csv27a = "#^#";
    auto csv28a = "^#^";
    auto csv29a = "$";
    auto csv30a = "$,$\n\"$\",\"$$\",$$\n^#$,$#^,#$^,^$#\n";
    auto csv31a = "1-1\n2-1,2-2\n3-1,3-2,3-3\n\n,5-2\n,,6-3\n";
    auto csv32a = ",1-2,\"1-3\"\n\"2-1\",\"2-2\",\n\"3-1\",,\"3-3\"";

    // Newlines terminating a line ending a non-quoted field
    auto csv33a = "\rX\r\nX\n\r\nX\r\n";

    // Newlines inside a quoted field and terminating a line following a quoted field
    auto csv34a = "\"\r\",\"X\r\",\"X\rY\",\"\rY\"\r\"\r\n\",\"X\r\n\",\"X\r\nY\",\"\r\nY\"\r\n\"\n\",\"X\n\",\"X\nY\",\"\nY\"\n";

    // CR at field end
    auto csv35a = "abc,def\r\"ghi\",\"jkl\"\r\"mno\",pqr\r";

    /* Set B has the same data and TSV results as set A, but uses # for quote and ^ for comma. */
    auto csv1b = "a^b^c";
    auto csv2b = "a^bc^^^def";
    auto csv3b = "^a^ b ^ cd ^";
    auto csv4b = "ß^ßÀß^あめりか物語^书名: 五色石";
    auto csv5b = "#\n#^#\n\n#^#\n\n\n#";
    auto csv6b = "#\t#^#\t\t#^#\t\t\t#";
    auto csv7b = "#,#^#,,#^#,,,#";
    auto csv8b = "##^#\"#^#\"\"#";
    auto csv9b = "#ab, de\tfg\"\nhij#";
    auto csv10b = "";
    auto csv11b = "^";
    auto csv12b = "^^";
    auto csv13b = "#\r#^#\r\r#^#\r\r\r#";
    auto csv14b = "#\r\n#^#\r\n\r\n#^#\r\n\r\n\r\n#";
    auto csv15b = "#ab, de\tfg\"\rhij#";
    auto csv16b = "#ab, de\tfg\"\r\nhij#";
    auto csv17b = "ab\"^ab\"cd";
    auto csv18b = "\n\n\n";
    auto csv19b = "\t";
    auto csv20b = "\t\t";
    auto csv21b = "a\n";
    auto csv22b = "a^\n";
    auto csv23b = "a^b\n";
    auto csv24b = "^\n";
    auto csv25b = "####";
    auto csv26b = "#^#";
    auto csv27b = "###^###";
    auto csv28b = "#^##^#";
    auto csv29b = "$";
    auto csv30b = "$^$\n#$#^#$$#^$$\n#^##$#^#$##^#^###$^#^#^$###\n";
    auto csv31b = "1-1\n2-1^2-2\n3-1^3-2^3-3\n\n^5-2\n^^6-3\n";
    auto csv32b = "^1-2^#1-3#\n#2-1#^#2-2#^\n#3-1#^^#3-3#";
    auto csv33b = "\rX\r\nX\n\r\nX\r\n";
    auto csv34b = "#\r#^#X\r#^#X\rY#^#\rY#\r#\r\n#^#X\r\n#^#X\r\nY#^#\r\nY#\r\n#\n#^#X\n#^#X\nY#^#\nY#\n";
    auto csv35b = "abc^def\r#ghi#^#jkl#\r#mno#^pqr\r";

    /* The expected results for csv sets A and B. This is for the default TSV delimiters.*/
    auto tsv1 = "a\tb\tc\n";
    auto tsv2 = "a\tbc\t\t\tdef\n";
    auto tsv3 = "\ta\t b \t cd \t\n";
    auto tsv4 = "ß\tßÀß\tあめりか物語\t书名: 五色石\n";
    auto tsv5 = " \t  \t   \n";
    auto tsv6 = " \t  \t   \n";
    auto tsv7 = ",\t,,\t,,,\n";
    auto tsv8 = "\t\"\t\"\"\n";
    auto tsv9 = "ab, de fg\" hij\n";
    auto tsv10 = "";
    auto tsv11 = "\t\n";
    auto tsv12 = "\t\t\n";
    auto tsv13 = " \t  \t   \n";
    auto tsv14 = " \t  \t   \n";
    auto tsv15 = "ab, de fg\" hij\n";
    auto tsv16 = "ab, de fg\" hij\n";
    auto tsv17 = "ab\"\tab\"cd\n";
    auto tsv18 = "\n\n\n";
    auto tsv19 = " \n";
    auto tsv20 = "  \n";
    auto tsv21 = "a\n";
    auto tsv22 = "a\t\n";
    auto tsv23 = "a\tb\n";
    auto tsv24 = "\t\n";
    auto tsv25 = "#\n";
    auto tsv26 = "^\n";
    auto tsv27 = "#^#\n";
    auto tsv28 = "^#^\n";
    auto tsv29 = "$\n";
    auto tsv30 = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n";
    auto tsv31 = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n";
    auto tsv32 = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n";
    auto tsv33 = "\nX\nX\n\nX\n";
    auto tsv34 = " \tX \tX Y\t Y\n \tX \tX Y\t Y\n \tX \tX Y\t Y\n";
    auto tsv35 = "abc\tdef\nghi\tjkl\nmno\tpqr\n";

    /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab.
     * This will also result in different replacements when TAB and $ appear in the CSV.
     */
    auto tsv1_x = "a$b$c\n";
    auto tsv2_x = "a$bc$$$def\n";
    auto tsv3_x = "$a$ b $ cd $\n";
    auto tsv4_x = "ß$ßÀß$あめりか物語$书名: 五色石\n";
    auto tsv5_x = " $  $   \n";
    auto tsv6_x = "\t$\t\t$\t\t\t\n";
    auto tsv7_x = ",$,,$,,,\n";
    auto tsv8_x = "$\"$\"\"\n";
    auto tsv9_x = "ab, de\tfg\" hij\n";
    auto tsv10_x = "";
    auto tsv11_x = "$\n";
    auto tsv12_x = "$$\n";
    auto tsv13_x = " $  $   \n";
    auto tsv14_x = " $  $   \n";
    auto tsv15_x = "ab, de\tfg\" hij\n";
    auto tsv16_x = "ab, de\tfg\" hij\n";
    auto tsv17_x = "ab\"$ab\"cd\n";
    auto tsv18_x = "\n\n\n";
    auto tsv19_x = "\t\n";
    auto tsv20_x = "\t\t\n";
    auto tsv21_x = "a\n";
    auto tsv22_x = "a$\n";
    auto tsv23_x = "a$b\n";
    auto tsv24_x = "$\n";
    auto tsv25_x = "#\n";
    auto tsv26_x = "^\n";
    auto tsv27_x = "#^#\n";
    auto tsv28_x = "^#^\n";
    auto tsv29_x = " \n";
    auto tsv30_x = " $ \n $  $  \n^# $ #^$# ^$^ #\n";
    auto tsv31_x = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n";
    auto tsv32_x = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n";
    auto tsv33_x = "\nX\nX\n\nX\n";
    auto tsv34_x = " $X $X Y$ Y\n $X $X Y$ Y\n $X $X Y$ Y\n";
    auto tsv35_x = "abc$def\nghi$jkl\nmno$pqr\n";

    /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab,
     * and with the delimiter/newline replacement string being |--|. Basically, newlines
     * and '$' in the original data are replaced by |--|.
     */
    auto tsv1_y = "a$b$c\n";
    auto tsv2_y = "a$bc$$$def\n";
    auto tsv3_y = "$a$ b $ cd $\n";
    auto tsv4_y = "ß$ßÀß$あめりか物語$书名: 五色石\n";
    auto tsv5_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv6_y = "\t$\t\t$\t\t\t\n";
    auto tsv7_y = ",$,,$,,,\n";
    auto tsv8_y = "$\"$\"\"\n";
    auto tsv9_y = "ab, de\tfg\"|--|hij\n";
    auto tsv10_y = "";
    auto tsv11_y = "$\n";
    auto tsv12_y = "$$\n";
    auto tsv13_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv14_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv15_y = "ab, de\tfg\"|--|hij\n";
    auto tsv16_y = "ab, de\tfg\"|--|hij\n";
    auto tsv17_y = "ab\"$ab\"cd\n";
    auto tsv18_y = "\n\n\n";
    auto tsv19_y = "\t\n";
    auto tsv20_y = "\t\t\n";
    auto tsv21_y = "a\n";
    auto tsv22_y = "a$\n";
    auto tsv23_y = "a$b\n";
    auto tsv24_y = "$\n";
    auto tsv25_y = "#\n";
    auto tsv26_y = "^\n";
    auto tsv27_y = "#^#\n";
    auto tsv28_y = "^#^\n";
    auto tsv29_y = "|--|\n";
    auto tsv30_y = "|--|$|--|\n|--|$|--||--|$|--||--|\n^#|--|$|--|#^$#|--|^$^|--|#\n";
    auto tsv31_y = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n";
    auto tsv32_y = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n";
    auto tsv33_y = "\nX\nX\n\nX\n";
    auto tsv34_y = "|--|$X|--|$X|--|Y$|--|Y\n|--|$X|--|$X|--|Y$|--|Y\n|--|$X|--|$X|--|Y$|--|Y\n";
    auto tsv35_y = "abc$def\nghi$jkl\nmno$pqr\n";

    /* The TSV results for CSV sets 1a and 1b, but with the TAB replacement as <TAB>
     * and newline replacement as <NL>.
     */
    auto tsv1_z = "a\tb\tc\n";
    auto tsv2_z = "a\tbc\t\t\tdef\n";
    auto tsv3_z = "\ta\t b \t cd \t\n";
    auto tsv4_z = "ß\tßÀß\tあめりか物語\t书名: 五色石\n";
    auto tsv5_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n";
    auto tsv6_z = "<TAB>\t<TAB><TAB>\t<TAB><TAB><TAB>\n";
    auto tsv7_z = ",\t,,\t,,,\n";
    auto tsv8_z = "\t\"\t\"\"\n";
    auto tsv9_z = "ab, de<TAB>fg\"<NL>hij\n";
    auto tsv10_z = "";
    auto tsv11_z = "\t\n";
    auto tsv12_z = "\t\t\n";
    auto tsv13_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n";
    auto tsv14_z = "<NL>\t<NL><NL>\t<NL><NL><NL>\n";
    auto tsv15_z = "ab, de<TAB>fg\"<NL>hij\n";
    auto tsv16_z = "ab, de<TAB>fg\"<NL>hij\n";
    auto tsv17_z = "ab\"\tab\"cd\n";
    auto tsv18_z = "\n\n\n";
    auto tsv19_z = "<TAB>\n";
    auto tsv20_z = "<TAB><TAB>\n";
    auto tsv21_z = "a\n";
    auto tsv22_z = "a\t\n";
    auto tsv23_z = "a\tb\n";
    auto tsv24_z = "\t\n";
    auto tsv25_z = "#\n";
    auto tsv26_z = "^\n";
    auto tsv27_z = "#^#\n";
    auto tsv28_z = "^#^\n";
    auto tsv29_z = "$\n";
    auto tsv30_z = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n";
    auto tsv31_z = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n";
    auto tsv32_z = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n";
    auto tsv33_z = "\nX\nX\n\nX\n";
    auto tsv34_z = "<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n<NL>\tX<NL>\tX<NL>Y\t<NL>Y\n";
    auto tsv35_z = "abc\tdef\nghi\tjkl\nmno\tpqr\n";

    /* Aggregate the test data into parallel arrays. */
    auto csvSet1a = [csv1a, csv2a, csv3a, csv4a, csv5a, csv6a, csv7a, csv8a, csv9a, csv10a,
                     csv11a, csv12a, csv13a, csv14a, csv15a, csv16a, csv17a, csv18a, csv19a, csv20a,
                     csv21a, csv22a, csv23a, csv24a, csv25a, csv26a, csv27a, csv28a, csv29a, csv30a,
                     csv31a, csv32a, csv33a, csv34a, csv35a];

    auto csvSet1b = [csv1b, csv2b, csv3b, csv4b, csv5b, csv6b, csv7b, csv8b, csv9b, csv10b,
                     csv11b, csv12b, csv13b, csv14b, csv15b, csv16b, csv17b, csv18b, csv19b, csv20b,
                     csv21b, csv22b, csv23b, csv24b, csv25b, csv26b, csv27b, csv28b, csv29b, csv30b,
                     csv31b, csv32b, csv33b, csv34b, csv35b];

    auto tsvSet1  = [tsv1, tsv2, tsv3, tsv4, tsv5, tsv6, tsv7, tsv8, tsv9, tsv10,
                     tsv11, tsv12, tsv13, tsv14, tsv15, tsv16, tsv17, tsv18, tsv19, tsv20,
                     tsv21, tsv22, tsv23, tsv24, tsv25, tsv26, tsv27, tsv28, tsv29, tsv30,
                     tsv31, tsv32, tsv33, tsv34, tsv35];

    auto tsvSet1_x  = [tsv1_x, tsv2_x, tsv3_x, tsv4_x, tsv5_x, tsv6_x, tsv7_x, tsv8_x, tsv9_x, tsv10_x,
                       tsv11_x, tsv12_x, tsv13_x, tsv14_x, tsv15_x, tsv16_x, tsv17_x, tsv18_x, tsv19_x, tsv20_x,
                       tsv21_x, tsv22_x, tsv23_x, tsv24_x, tsv25_x, tsv26_x, tsv27_x, tsv28_x, tsv29_x, tsv30_x,
                       tsv31_x, tsv32_x, tsv33_x, tsv34_x, tsv35_x];

    auto tsvSet1_y  = [tsv1_y, tsv2_y, tsv3_y, tsv4_y, tsv5_y, tsv6_y, tsv7_y, tsv8_y, tsv9_y, tsv10_y,
                       tsv11_y, tsv12_y, tsv13_y, tsv14_y, tsv15_y, tsv16_y, tsv17_y, tsv18_y, tsv19_y, tsv20_y,
                       tsv21_y, tsv22_y, tsv23_y, tsv24_y, tsv25_y, tsv26_y, tsv27_y, tsv28_y, tsv29_y, tsv30_y,
                       tsv31_y, tsv32_y, tsv33_y, tsv34_y, tsv35_y];

    auto tsvSet1_z  = [tsv1_z, tsv2_z, tsv3_z, tsv4_z, tsv5_z, tsv6_z, tsv7_z, tsv8_z, tsv9_z, tsv10_z,
                       tsv11_z, tsv12_z, tsv13_z, tsv14_z, tsv15_z, tsv16_z, tsv17_z, tsv18_z, tsv19_z, tsv20_z,
                       tsv21_z, tsv22_z, tsv23_z, tsv24_z, tsv25_z, tsv26_z, tsv27_z, tsv28_z, tsv29_z, tsv30_z,
                       tsv31_z, tsv32_z, tsv33_z, tsv34_z, tsv35_z];

    /* The tests. Each scenario is run with several read buffer sizes, including sizes
     * smaller than most of the inputs.
     */
    auto bufferSizeTests = [1, 2, 3, 8, 128];

    foreach (bufferSize; bufferSizeTests)
    {
        ubyte[] readBuffer = new ubyte[](bufferSize);

        foreach (i, csva, csvb, tsv, tsv_x, tsv_y, tsv_z; lockstep(csvSet1a, csvSet1b, tsvSet1, tsvSet1_x, tsvSet1_y, tsvSet1_z))
        {
            /* Byte streams for csv2tsv. A fresh stream is built before every csv2tsv
             * call, as the stream is treated as consumed by the call.
             */
            ubyte[] csvInputA = csva.dup.representation;
            ubyte[] csvInputB = csvb.dup.representation;

            /* CSV Set A vs TSV expected. */
            auto tsvResultA = appender!(char[])();
            csv2tsv(csvInputA, tsvResultA, readBuffer, "csvInputA_defaultTSV");
            assert(tsv == tsvResultA.data,
                   format("Unittest failure. tsv != tsvResultA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csva, tsv, tsvResultA.data));

            /* CSV Set B vs TSV expected. Different CSV delimiters, same TSV results as CSV Set A.*/
            auto tsvResultB = appender!(char[])();
            csv2tsv(csvInputB, tsvResultB, readBuffer, "csvInputB_defaultTSV", 0, '#', '^');
            assert(tsv == tsvResultB.data,
                   format("Unittest failure. tsv != tsvResultB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csvb, tsv, tsvResultB.data));

            /* CSV Set A and TSV with $ separator.*/
            csvInputA = csva.dup.representation;
            auto tsvResult_XA = appender!(char[])();
            csv2tsv(csvInputA, tsvResult_XA, readBuffer, "csvInputA_TSV_WithDollarDelimiter", 0, '"', ',', '$');
            assert(tsv_x == tsvResult_XA.data,
                   format("Unittest failure. tsv_x != tsvResult_XA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csva, tsv_x, tsvResult_XA.data));

            /* CSV Set B and TSV with $ separator. Same TSV results as CSV Set A.*/
            csvInputB = csvb.dup.representation;
            auto tsvResult_XB = appender!(char[])();
            csv2tsv(csvInputB, tsvResult_XB, readBuffer, "csvInputB__TSV_WithDollarDelimiter", 0, '#', '^', '$');
            assert(tsv_x == tsvResult_XB.data,
                   format("Unittest failure. tsv_x != tsvResult_XB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csvb, tsv_x, tsvResult_XB.data));

            /* CSV Set A and TSV with $ separator and tsv delimiter/newline replacement. */
            csvInputA = csva.dup.representation;
            auto tsvResult_YA = appender!(char[])();
            csv2tsv(csvInputA, tsvResult_YA, readBuffer, "csvInputA_TSV_WithDollarAndDelimReplacement", 0, '"', ',', '$', "|--|", "|--|");
            assert(tsv_y == tsvResult_YA.data,
                   format("Unittest failure. tsv_y != tsvResult_YA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csva, tsv_y, tsvResult_YA.data));

            /* CSV Set B and TSV with $ separator and tsv delimiter/newline replacement. Same TSV as CSV Set A.*/
            csvInputB = csvb.dup.representation;
            auto tsvResult_YB = appender!(char[])();
            csv2tsv(csvInputB, tsvResult_YB, readBuffer, "csvInputB__TSV_WithDollarAndDelimReplacement", 0, '#', '^', '$', "|--|", "|--|");
            assert(tsv_y == tsvResult_YB.data,
                   format("Unittest failure. tsv_y != tsvResult_YB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csvb, tsv_y, tsvResult_YB.data));

            /* CSV Set A and TSV with TAB replacement as <TAB> and newline replacement as <NL>. */
            csvInputA = csva.dup.representation;
            auto tsvResult_ZA = appender!(char[])();
            csv2tsv(csvInputA, tsvResult_ZA, readBuffer, "csvInputA_TSV_WithDifferentTABandNLReplacements", 0, '"', ',', '\t', "<TAB>", "<NL>");
            assert(tsv_z == tsvResult_ZA.data,
                   format("Unittest failure. tsv_z != tsvResult_ZA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csva, tsv_z, tsvResult_ZA.data));

            /* CSV Set B and TSV with TAB replacement as <TAB> and newline replacement as <NL>.
             * Same TSV as CSV Set A. Mirrors the A/B pairing used by the other scenarios.
             */
            csvInputB = csvb.dup.representation;
            auto tsvResult_ZB = appender!(char[])();
            csv2tsv(csvInputB, tsvResult_ZB, readBuffer, "csvInputB_TSV_WithDifferentTABandNLReplacements", 0, '#', '^', '\t', "<TAB>", "<NL>");
            assert(tsv_z == tsvResult_ZB.data,
                   format("Unittest failure. tsv_z != tsvResult_ZB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, csvb, tsv_z, tsvResult_ZB.data));
        }
    }
}
// csv2tsv skiplines tests
unittest
{
    /* Each CSV input below is converted twice, once passing 1 and once passing 2 as the
     * csv2tsv argument that follows the file name, and the output is compared against
     * the matching "Skip1" / "Skip2" expected string.
     *
     * The inputs are organized by line terminator: no terminator (csv1, csv2), LF
     * (csv3-csv11), CR (csv12-csv20), and CRLF (csv21-csv29). Within each terminator
     * family there are three groups of one, two, and three lines: empty lines,
     * single-letter lines, and lines consisting of one quoted field that contains the
     * terminator itself.
     *
     * Every case is repeated for several read buffer sizes.
     */
    import std.string : representation;

    auto csv1 = "";
    auto csv2 = "a";

    auto csv3 = "\n";
    auto csv4 = "\n\n";
    auto csv5 = "\n\n\n";

    auto csv6 = "a\n";
    auto csv7 = "a\nb\n";
    auto csv8 = "a\nb\nc\n";

    auto csv9 = "\"\n\"\n";
    auto csv10 = "\"\n\"\n\"\n\"\n";
    auto csv11 = "\"\n\"\n\"\n\"\n\"\n\"\n";

    auto csv12 = "\r";
    auto csv13 = "\r\r";
    auto csv14 = "\r\r\r";

    auto csv15 = "a\r";
    auto csv16 = "a\rb\r";
    auto csv17 = "a\rb\rc\r";

    auto csv18 = "\"\r\"\r";
    auto csv19 = "\"\r\"\r\"\r\"\r";
    auto csv20 = "\"\r\"\r\"\r\"\r\"\r\"\r";

    auto csv21 = "\r\n";
    auto csv22 = "\r\n\r\n";
    auto csv23 = "\r\n\r\n\r\n";

    auto csv24 = "a\r\n";
    auto csv25 = "a\r\nb\r\n";
    auto csv26 = "a\r\nb\r\nc\r\n";

    auto csv27 = "\"\r\n\"\r\n";
    auto csv28 = "\"\r\n\"\r\n\"\r\n\"\r\n";
    auto csv29 = "\"\r\n\"\r\n\"\r\n\"\r\n\"\r\n\"\r\n";

    /* The Skip 1 expected results. */
    auto tsv1Skip1 = "";
    auto tsv2Skip1 = "";

    auto tsv3Skip1 = "";
    auto tsv4Skip1 = "\n";
    auto tsv5Skip1 = "\n\n";

    auto tsv6Skip1 = "";
    auto tsv7Skip1 = "b\n";
    auto tsv8Skip1 = "b\nc\n";

    auto tsv9Skip1 = "";
    auto tsv10Skip1 = " \n";
    auto tsv11Skip1 = " \n \n";

    auto tsv12Skip1 = "";
    auto tsv13Skip1 = "\n";
    auto tsv14Skip1 = "\n\n";

    auto tsv15Skip1 = "";
    auto tsv16Skip1 = "b\n";
    auto tsv17Skip1 = "b\nc\n";

    auto tsv18Skip1 = "";
    auto tsv19Skip1 = " \n";
    auto tsv20Skip1 = " \n \n";

    auto tsv21Skip1 = "";
    auto tsv22Skip1 = "\n";
    auto tsv23Skip1 = "\n\n";

    auto tsv24Skip1 = "";
    auto tsv25Skip1 = "b\n";
    auto tsv26Skip1 = "b\nc\n";

    auto tsv27Skip1 = "";
    auto tsv28Skip1 = " \n";
    auto tsv29Skip1 = " \n \n";

    /* The Skip 2 expected results. */
    auto tsv1Skip2 = "";
    auto tsv2Skip2 = "";

    auto tsv3Skip2 = "";
    auto tsv4Skip2 = "";
    auto tsv5Skip2 = "\n";

    auto tsv6Skip2 = "";
    auto tsv7Skip2 = "";
    auto tsv8Skip2 = "c\n";

    auto tsv9Skip2 = "";
    auto tsv10Skip2 = "";
    auto tsv11Skip2 = " \n";

    auto tsv12Skip2 = "";
    auto tsv13Skip2 = "";
    auto tsv14Skip2 = "\n";

    auto tsv15Skip2 = "";
    auto tsv16Skip2 = "";
    auto tsv17Skip2 = "c\n";

    auto tsv18Skip2 = "";
    auto tsv19Skip2 = "";
    auto tsv20Skip2 = " \n";

    auto tsv21Skip2 = "";
    auto tsv22Skip2 = "";
    auto tsv23Skip2 = "\n";

    auto tsv24Skip2 = "";
    auto tsv25Skip2 = "";
    auto tsv26Skip2 = "c\n";

    auto tsv27Skip2 = "";
    auto tsv28Skip2 = "";
    auto tsv29Skip2 = " \n";

    /* Aggregate the test data into parallel arrays. */
    auto csvSet =
        [csv1, csv2, csv3, csv4, csv5, csv6, csv7, csv8, csv9, csv10,
         csv11, csv12, csv13, csv14, csv15, csv16, csv17, csv18, csv19, csv20,
         csv21, csv22, csv23, csv24, csv25, csv26, csv27, csv28, csv29];

    auto tsvSkip1Set =
        [tsv1Skip1, tsv2Skip1, tsv3Skip1, tsv4Skip1, tsv5Skip1, tsv6Skip1, tsv7Skip1, tsv8Skip1, tsv9Skip1, tsv10Skip1,
         tsv11Skip1, tsv12Skip1, tsv13Skip1, tsv14Skip1, tsv15Skip1, tsv16Skip1, tsv17Skip1, tsv18Skip1, tsv19Skip1, tsv20Skip1,
         tsv21Skip1, tsv22Skip1, tsv23Skip1, tsv24Skip1, tsv25Skip1, tsv26Skip1, tsv27Skip1, tsv28Skip1, tsv29Skip1];

    auto tsvSkip2Set =
        [tsv1Skip2, tsv2Skip2, tsv3Skip2, tsv4Skip2, tsv5Skip2, tsv6Skip2, tsv7Skip2, tsv8Skip2, tsv9Skip2, tsv10Skip2,
         tsv11Skip2, tsv12Skip2, tsv13Skip2, tsv14Skip2, tsv15Skip2, tsv16Skip2, tsv17Skip2, tsv18Skip2, tsv19Skip2, tsv20Skip2,
         tsv21Skip2, tsv22Skip2, tsv23Skip2, tsv24Skip2, tsv25Skip2, tsv26Skip2, tsv27Skip2, tsv28Skip2, tsv29Skip2];

    auto bufferSizeTests = [1, 2, 3, 4, 8, 128];

    foreach (bufferSize; bufferSizeTests)
    {
        ubyte[] readBuffer = new ubyte[](bufferSize);

        foreach (i, csv, tsvSkip1, tsvSkip2; lockstep(csvSet, tsvSkip1Set, tsvSkip2Set))
        {
            /* Mutable byte copy of the test string, used as the csv2tsv input. */
            ubyte[] csvInput = csv.dup.representation;
            auto csvToTSVSkip1 = appender!(char[])();
            auto csvToTSVSkip2 = appender!(char[])();

            csv2tsv(csvInput, csvToTSVSkip1, readBuffer, "csvToTSVSkip1", 1);

            assert(tsvSkip1 == csvToTSVSkip1.data,
                   format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csv, tsvSkip1, csvToTSVSkip1.data));

            csv2tsv(csvInput, csvToTSVSkip2, readBuffer, "csvToTSVSkip2", 2);

            assert(tsvSkip2 == csvToTSVSkip2.data,
                   format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csv, tsvSkip2, csvToTSVSkip2.data));
        }
    }
}
// csv2tsv BOM tests. Note: std.range.lockstep prevents use of @safe
unittest
{
    /* Each CSV input is converted with and without a leading UTF-8 byte order mark,
     * and with the final csv2tsv argument set to true and to false (BOM removal on and
     * off, going by the expected results and result names below).
     *
     * Expectations encoded by the assertions:
     *  - Input without a BOM produces the plain TSV regardless of the flag.
     *  - Input with a BOM and the flag true produces the plain TSV, unless the read
     *    buffer is shorter than the BOM, in which case the BOM is passed through.
     *  - Input with a BOM and the flag false always passes the BOM through.
     */
    import std.conv : hexString;
    import std.string : representation;

    enum utf8BOM = hexString!"efbbbf";

    auto csv1 = "";
    auto csv2 = "a";
    auto csv3 = "ab";
    auto csv4 = "a,b";
    auto csv5 = "a,b\ncdef,ghi\njklmn,opqrs\ntuv,wxyz";

    auto csv1BOM = utf8BOM ~ csv1;
    auto csv2BOM = utf8BOM ~ csv2;
    auto csv3BOM = utf8BOM ~ csv3;
    auto csv4BOM = utf8BOM ~ csv4;
    auto csv5BOM = utf8BOM ~ csv5;

    auto tsv1 = "";
    auto tsv2 = "a\n";
    auto tsv3 = "ab\n";
    auto tsv4 = "a\tb\n";
    auto tsv5 = "a\tb\ncdef\tghi\njklmn\topqrs\ntuv\twxyz\n";

    /* Note: csv1 is the empty string, so tsv1 does not have a trailing newline.
     * However, with the BOM prepended the tsv gets a trailing newline.
     */
    auto tsv1BOM = utf8BOM ~ tsv1 ~ "\n";
    auto tsv2BOM = utf8BOM ~ tsv2;
    auto tsv3BOM = utf8BOM ~ tsv3;
    auto tsv4BOM = utf8BOM ~ tsv4;
    auto tsv5BOM = utf8BOM ~ tsv5;

    auto csvSet = [csv1, csv2, csv3, csv4, csv5];
    auto csvBOMSet = [csv1BOM, csv2BOM, csv3BOM, csv4BOM, csv5BOM];

    auto tsvSet = [tsv1, tsv2, tsv3, tsv4, tsv5];
    auto tsvBOMSet = [tsv1BOM, tsv2BOM, tsv3BOM, tsv4BOM, tsv5BOM];

    auto bufferSizeTests = [1, 2, 3, 4, 8, 128];

    foreach (bufferSize; bufferSizeTests)
    {
        ubyte[] readBuffer = new ubyte[](bufferSize);

        foreach (i, csv, csvBOM, tsv, tsvBOM; lockstep(csvSet, csvBOMSet, tsvSet, tsvBOMSet))
        {
            /* Mutable byte copies of the test strings, used as csv2tsv inputs. */
            ubyte[] csvInput = csv.dup.representation;
            ubyte[] csvBOMInput = csvBOM.dup.representation;

            auto csvToTSV = appender!(char[])();
            auto csvToTSV_NoBOMRemoval = appender!(char[])();
            auto csvBOMToTSV = appender!(char[])();
            auto csvBOMToTSV_NoBOMRemoval = appender!(char[])();

            /* No BOM in the input, flag true. */
            csv2tsv(csvInput, csvToTSV, readBuffer, "csvToTSV", 0, '"', ',', '\t', " ", " ", true);
            assert(tsv == csvToTSV.data,
                   format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csv, tsv, csvToTSV.data));

            /* No BOM in the input, flag false. */
            csv2tsv(csvInput, csvToTSV_NoBOMRemoval, readBuffer, "csvToTSV_NoBOMRemoval", 0, '"', ',', '\t', " ", " ", false);
            assert(tsv == csvToTSV_NoBOMRemoval.data,
                   format("Unittest failure. tsv != csvToTSV_NoBOMRemoval.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csv, tsv, csvToTSV_NoBOMRemoval.data));

            /* BOM in the input, flag true. The failure messages report the BOM-prefixed
             * input and the expected value actually used in the comparison.
             */
            csv2tsv(csvBOMInput, csvBOMToTSV, readBuffer, "csvBOMToTSV", 0, '"', ',', '\t', " ", " ", true);
            if (readBuffer.length < utf8BOM.length)
            {
                /* Removing BOMs, but didn't provide enough buffer, so no removal. */
                assert(tsvBOM == csvBOMToTSV.data,
                       format("Unittest failure. tsvBOM != csvBOMToTSV.data. (Small buffer) Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                              i + 1, bufferSize, csvBOM, tsvBOM, csvBOMToTSV.data));
            }
            else
            {
                assert(tsv == csvBOMToTSV.data,
                       format("Unittest failure. tsv != csvBOMToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                              i + 1, bufferSize, csvBOM, tsv, csvBOMToTSV.data));
            }

            /* BOM in the input, flag false. */
            csv2tsv(csvBOMInput, csvBOMToTSV_NoBOMRemoval, readBuffer, "csvBOMToTSV_NoBOMRemoval", 0, '"', ',', '\t', " ", " ", false);
            assert(tsvBOM == csvBOMToTSV_NoBOMRemoval.data,
                   format("Unittest failure. tsvBOM != csvBOMToTSV_NoBOMRemoval.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                          i + 1, bufferSize, csvBOM, tsvBOM, csvBOMToTSV_NoBOMRemoval.data));
        }
    }
}