tsv_utils.csv2tsv source code

1 /**
2 Convert CSV formatted data to TSV format.
3 
4 This program converts comma-separated value data to tab-separated format.
5 
6 Copyright (c) 2016-2020, eBay Inc.
7 Initially written by Jon Degenhardt
8 
9 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
10 */
11 
12 module tsv_utils.csv2tsv;
13 
14 import std.stdio;
15 import std.exception : enforce;
16 import std.format : format;
17 import std.range;
18 import std.traits : Unqual;
19 import std.typecons : Nullable, tuple;
20 
/** Short usage text. processArgs hands it to defaultGetoptPrinter, which prints
 * it followed by the generated option list.
 */
immutable string helpText = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts comma-separated text (CSV) to tab-separated format (TSV). Records
are read from files or standard input, converted records written to standard output.
Use '--help-verbose' for details the CSV formats accepted.

Options:
EOS";
30 
/** Long usage text printed for --help-verbose. processArgs hands it to
 * defaultGetoptPrinter, which prints it followed by the generated option list.
 */
immutable string helpTextVerbose = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts CSV (comma-separated) text to TSV (tab-separated) format. Records
are read from files or standard input, converted records written to standard output.

Both formats represent tabular data, each record on its own line, fields separated
by a delimiter character. The key difference is that CSV uses escape sequences to
represent newlines and field separators in the data, whereas TSV disallows these
characters in the data. The most common field delimiters are comma for CSV and tab
for TSV, but any character can be used.

Conversion to TSV is done by removing CSV escape syntax, changing field delimiters,
and replacing newlines and field delimiters in the data. By default, newlines and
field delimiters in the data are replaced by spaces. Most details are customizable.

There is no single spec for CSV, any number of variants can be found. The escape
syntax is common enough: fields containing newlines or field delimiters are placed
in double quotes. Inside a quoted field, a double quote is represented by a pair of
double quotes. As with field separators, the quoting character is customizable.

Behaviors of this program that often vary between CSV implementations:
  * Newlines are supported in quoted fields.
  * Double quotes are permitted in a non-quoted field. However, a field starting
    with a quote must follow quoting rules.
  * Each record can have a different numbers of fields.
  * The three common forms of newlines are supported: CR, CRLF, LF.
  * A newline will be added if the file does not end with one.
  * No whitespace trimming is done.

This program does not validate CSV correctness, but will terminate with an error
upon reaching an inconsistent state. Improperly terminated quoted fields are the
primary cause.

UTF-8 input is assumed. Convert other encodings prior to invoking this tool.

Options:
EOS";
69 
/** Command line settings for csv2tsv.
 *
 * Each field starts at its default value; processArgs overwrites the ones given
 * on the command line.
 */
struct Csv2tsvOptions
{
    string programName;
    bool helpVerbose = false;          // --help-verbose
    bool hasHeader = false;            // -H, --header
    char csvQuoteChar = '"';           // -q, --quote
    char csvDelimChar = ',';           // -c, --csv-delim
    char tsvDelimChar = '\t';          // -t, --tsv-delim
    string tsvDelimReplacement = " ";  // -r, --replacement
    bool versionWanted = false;        // -V, --version

    /** Parses the command line into this struct and validates the delimiter settings.
     *
     * cmdArgs is taken by reference and handed to getopt. The program name is
     * derived from cmdArgs[0] (extension and directory removed).
     *
     * Returns: A (bool, int) tuple. The bool is true when the caller should go on
     *          to process input. When it is false the int is the exit code to use:
     *          0 after printing help or version text, 1 after an argument error
     *          (which is reported on stderr).
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : canFind;
        import std.getopt;
        import std.path : baseName, stripExtension;

        static bool isNewline(dchar c) { return c == '\n' || c == '\r'; }

        if (cmdArgs.length > 0) programName = cmdArgs[0].stripExtension.baseName;
        else programName = "Unknown_program_name";

        try
        {
            auto parsed = getopt(
                cmdArgs,
                "help-verbose",  "     Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",      "     Treat the first line of each file as a header. Only the header of the first file is output.", &hasHeader,
                std.getopt.config.caseSensitive,
                "q|quote",       "CHR  Quoting character in CSV data. Default: double-quote (\")", &csvQuoteChar,
                "c|csv-delim",   "CHR  Field delimiter in CSV data. Default: comma (,).", &csvDelimChar,
                "t|tsv-delim",   "CHR  Field delimiter in TSV data. Default: TAB", &tsvDelimChar,
                "r|replacement", "STR  Replacement for newline and TSV field delimiters found in CSV input. Default: Space.", &tsvDelimReplacement,
                std.getopt.config.caseSensitive,
                "V|version",     "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            /* Informational requests: print and stop with a success exit code. */
            if (parsed.helpWanted)
            {
                defaultGetoptPrinter(helpText, parsed.options);
                return tuple(false, 0);
            }

            if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, parsed.options);
                return tuple(false, 0);
            }

            if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("csv2tsv"));
                return tuple(false, 0);
            }

            /* Delimiter sanity checks. The first failing check determines the message. */
            enforce(!isNewline(csvQuoteChar),
                    "CSV quote character cannot be newline (--q|quote).");

            enforce(csvQuoteChar != csvDelimChar,
                    "CSV quote and CSV field delimiter characters must be different (--q|quote, --c|csv-delim).");

            enforce(csvQuoteChar != tsvDelimChar,
                    "CSV quote and TSV field delimiter characters must be different (--q|quote, --t|tsv-delim).");

            enforce(!isNewline(csvDelimChar),
                    "CSV field delimiter cannot be newline (--c|csv-delim).");

            enforce(!isNewline(tsvDelimChar),
                    "TSV field delimiter cannot be newline (--t|tsv-delim).");

            enforce(!tsvDelimReplacement.canFind!(c => isNewline(c) || c == tsvDelimChar),
                    "Replacement character cannot contain newlines or TSV field delimiters (--r|replacement).");
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }

        return tuple(true, 0);
    }
}
152 
/* Runtime option embedded in the executable via the `rt_options` symbol. Only
 * declared for compiler front-end versions 2085 and later.
 * NOTE(review): "gcopt=cleanup:none" presumably tells the GC to skip its cleanup
 * pass at program exit - confirm against the druntime GC configuration docs.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // No main in unittest
}
else
{
    /** Program entry point.
     *
     * Parses the command line, then converts the inputs named by the remaining
     * arguments (cmdArgs[1..$]) via csv2tsvFiles.
     *
     * Returns: The exit code chosen by processArgs when it asks to stop, 1 if
     *          conversion throws, 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        Csv2tsvOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try csv2tsvFiles(cmdopt, cmdArgs[1..$]);
        catch (Exception exc)
        {
            /* Emit a newline on stdout and flush it, so whatever has been written
             * to stdout so far is pushed out before the message goes to stderr.
             * The stream to flush is stdout, not stdin.
             */
            writeln();
            stdout.flush();
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}
190 
/* A size_t that can also be "null". Nullable's second template argument names the
 * value reserved to mean null, here size_t.max.
 */
alias NullableSizeT = Nullable!(size_t, size_t.max);


/** Converts every input in turn and writes the TSV to standard output.
 *
 * An empty inputFiles list means standard input; a file named "-" also means
 * standard input. When cmdopt.hasHeader is set, the first record of every input
 * after the first is converted into a NullSink (discarded) so that only the
 * first input's header reaches the output.
 */
void csv2tsvFiles(const ref Csv2tsvOptions cmdopt, const string[] inputFiles)
{
    import std.algorithm : joiner;
    import tsv_utils.common.utils : BufferedOutputRange;

    /* One buffer backs both read paths: files use all of it, standard input
     * uses only the first 1024 bytes.
     */
    ubyte[1024 * 128] fileRawBuf;
    ubyte[] stdinRawBuf = fileRawBuf[0..1024];
    auto stdoutWriter = BufferedOutputRange!(typeof(stdout))(stdout);

    const sources = (inputFiles.length > 0) ? inputFiles : ["-"];

    foreach (fileIndex, filename; sources)
    {
        auto chunks = (filename == "-") ?
            stdin.byChunk(stdinRawBuf) : filename.File.byChunk(fileRawBuf);
        auto byteStream = chunks.joiner;

        const bool dropHeader = cmdopt.hasHeader && fileIndex > 0;

        if (dropHeader)
        {
            /* Consume exactly one record into a sink that discards it, then
             * convert the rest of the same stream, reporting line numbers as
             * starting after that first line.
             */
            auto discard = NullSink();
            csv2tsv(byteStream, discard, filename, 0,
                    cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                    cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement,
                    NullableSizeT(1));
            csv2tsv(byteStream, stdoutWriter, filename, 1,
                    cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                    cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement);
        }
        else
        {
            csv2tsv(byteStream, stdoutWriter, filename, 0,
                    cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                    cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement);
        }
    }
}
237 
/** Reads CSV from an input range, converts it to TSV and writes it to an output range.
 *
 * Params:
 *   inputStream         =  A ubyte input range holding the CSV text. It is consumed as it
 *                          is read. Working on ubyte matches what byChunk produces and
 *                          avoids decoding to dchar in front().
 *   outputStream        =  An output range of char that receives the TSV text.
 *   filename            =  Name used when reporting errors; "-" is reported as
 *                          "Standard Input". Any descriptive name may be used.
 *   currFileLineNumber  =  Number of lines already processed from this input. Added to
 *                          the record count when reporting errors.
 *   csvQuote            =  The quoting character used in the CSV input.
 *   csvDelim            =  The field delimiter used in the CSV input.
 *   tsvDelim            =  The field delimiter to write in the TSV output.
 *   tsvDelimReplacement =  Text written in place of TSV delimiters found in field data
 *                          and in place of newlines found inside quoted fields.
 *   maxRecords          =  Stop after this many records (output lines) have been written.
 *                          Null (the default) means no limit. Intended for processing a
 *                          header line separately from the rest of the input.
 *
 * Throws: Exception on finding inconsistent CSV (an improperly terminated quoted field).
 *         The message includes the file name and the line number where it was detected.
 */
void csv2tsv(InputRange, OutputRange)
    (auto ref InputRange inputStream, auto ref OutputRange outputStream,
     string filename = "(none)", size_t currFileLineNumber = 0,
     const char csvQuote = '"', const char csvDelim = ',', const char tsvDelim = '\t',
     string tsvDelimReplacement = " ",
     NullableSizeT maxRecords=NullableSizeT.init,
     )
if (isInputRange!InputRange && isOutputRange!(OutputRange, char) &&
    is(Unqual!(ElementType!InputRange) == ubyte))
{
    /* Parser modes:
     *   fieldStart  - at start of input or just after a field/record terminator.
     *   plainField  - inside a field that did not open with the quote character.
     *   quotedField - inside a field that opened with the quote character.
     *   quoteSeen   - inside a quoted field, immediately after a quote character.
     */
    enum Mode { fieldStart, plainField, quotedField, quoteSeen }

    Mode mode = Mode.fieldStart;
    size_t recordNum = 1;       // Record being read; also the output line number.
    size_t fieldsSeen = 0;      // Fields started on the current line.

    /* Builds the error text for a badly terminated quoted field. */
    string unterminatedQuoteMessage()
    {
        return format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                      (filename == "-") ? "Standard Input" : filename,
                      currFileLineNumber + recordNum);
    }

    /* Writes the record terminator and resets per-line state. Returns true when
     * the maxRecords limit has been reached and reading should stop.
     */
    bool finishRecord()
    {
        put(outputStream, '\n');
        ++recordNum;
        fieldsSeen = 0;
        mode = Mode.fieldStart;
        return !maxRecords.isNull && recordNum > maxRecords.get;
    }

    while (!inputStream.empty)
    {
        char ch = inputStream.front;
        inputStream.popFront;

        /* Treat CR and CRLF as a single LF. */
        if (ch == '\r')
        {
            if (!inputStream.empty && inputStream.front == '\n') inputStream.popFront;
            ch = '\n';
        }

        if (mode == Mode.fieldStart)
        {
            ++fieldsSeen;

            if (ch == csvQuote)
            {
                mode = Mode.quotedField;
                continue;
            }

            /* Not a quote: the same character is handled below as plain field data. */
            mode = Mode.plainField;
        }

        /* Within each mode the comparisons are made in a fixed order, so if two of
         * the special characters are equal the earlier test wins.
         */
        if (mode == Mode.plainField)
        {
            if (ch == csvDelim)
            {
                put(outputStream, tsvDelim);
                mode = Mode.fieldStart;
            }
            else if (ch == tsvDelim)
            {
                put(outputStream, tsvDelimReplacement);
            }
            else if (ch == '\n')
            {
                if (finishRecord()) break;
            }
            else
            {
                put(outputStream, ch);
            }
        }
        else if (mode == Mode.quotedField)
        {
            if (ch == csvQuote)
            {
                /* Either an escaped quote or the end of the field; the next
                 * character decides. If there is no next character this is a
                 * quoted field ending the input without a trailing newline, so go
                 * back to fieldStart. That avoids the unterminated-quote error
                 * below and still lets the final newline be added.
                 */
                mode = inputStream.empty ? Mode.fieldStart : Mode.quoteSeen;
            }
            else if (ch == '\n' || ch == tsvDelim)
            {
                /* Newline or TSV delimiter inside the quoted data. */
                put(outputStream, tsvDelimReplacement);
            }
            else
            {
                put(outputStream, ch);
            }
        }
        else
        {
            /* Mode.quoteSeen: the previous character was a quote inside a quoted field. */
            if (ch == csvQuote)
            {
                /* Doubled quote: one literal quote character of data. */
                put(outputStream, csvQuote);
                mode = Mode.quotedField;
            }
            else if (ch == csvDelim)
            {
                put(outputStream, tsvDelim);
                mode = Mode.fieldStart;
            }
            else if (ch == '\n')
            {
                if (finishRecord()) break;
            }
            else
            {
                throw new Exception(unterminatedQuoteMessage());
            }
        }
    }

    /* Input ended while still inside a quoted field. */
    enforce(mode != Mode.quotedField, unterminatedQuoteMessage());

    /* Last line had content but no terminating newline. */
    if (fieldsSeen > 0) put(outputStream, '\n');
}
397 
unittest
{
    /* Table-driven tests for the csv2tsv function.
     *
     * Five parallel tables of 32 entries each. Entry N of every table belongs to
     * test N:
     *   - csvSet1a:   CSV input using the default quote (") and delimiter (,).
     *   - csvSet1b:   The same data written with # as the quote and ^ as the delimiter.
     *   - tsvSet1:    Expected TSV with the default TSV delimiter (TAB) and replacement.
     *   - tsvSet1_x:  Expected TSV when $ is the TSV delimiter.
     *   - tsvSet1_y:  Expected TSV when $ is the TSV delimiter and |--| the replacement.
     *
     * Changing the CSV delimiters does not change the resulting TSV, so sets A and B
     * share expected results. Changing the TSV delimiter does change the result, as
     * TSV cannot carry its delimiter in the data.
     *
     * This test set does not test main, file handling, or error messages. These are
     * handled by tests run against the executable.
     */

    /* Default CSV. */
    auto csvSet1a = [
        "a,b,c",                                              //  1
        "a,bc,,,def",                                         //  2
        ",a, b , cd ,",                                       //  3
        "ß,ßÀß,あめりか物語,书名: 五色石",                     //  4
        "\"\n\",\"\n\n\",\"\n\n\n\"",                         //  5
        "\"\t\",\"\t\t\",\"\t\t\t\"",                         //  6
        "\",\",\",,\",\",,,\"",                               //  7
        "\"\",\"\"\"\",\"\"\"\"\"\"",                         //  8
        "\"ab, de\tfg\"\"\nhij\"",                            //  9
        "",                                                   // 10
        ",",                                                  // 11
        ",,",                                                 // 12
        "\"\r\",\"\r\r\",\"\r\r\r\"",                         // 13
        "\"\r\n\",\"\r\n\r\n\",\"\r\n\r\n\r\n\"",             // 14
        "\"ab, de\tfg\"\"\rhij\"",                            // 15
        "\"ab, de\tfg\"\"\r\nhij\"",                          // 16
        "ab\",ab\"cd",                                        // 17
        "\n\n\n",                                             // 18
        "\t",                                                 // 19
        "\t\t",                                               // 20
        "a\n",                                                // 21
        "a,\n",                                               // 22
        "a,b\n",                                              // 23
        ",\n",                                                // 24
        "#",                                                  // 25
        "^",                                                  // 26
        "#^#",                                                // 27
        "^#^",                                                // 28
        "$",                                                  // 29
        "$,$\n\"$\",\"$$\",$$\n^#$,$#^,#$^,^$#\n",            // 30
        "1-1\n2-1,2-2\n3-1,3-2,3-3\n\n,5-2\n,,6-3\n",         // 31
        ",1-2,\"1-3\"\n\"2-1\",\"2-2\",\n\"3-1\",,\"3-3\"",   // 32
    ];

    /* Set B has the same data and TSV results as set A, but uses # for quote and ^ for comma. */
    auto csvSet1b = [
        "a^b^c",                                              //  1
        "a^bc^^^def",                                         //  2
        "^a^ b ^ cd ^",                                       //  3
        "ß^ßÀß^あめりか物語^书名: 五色石",                     //  4
        "#\n#^#\n\n#^#\n\n\n#",                               //  5
        "#\t#^#\t\t#^#\t\t\t#",                               //  6
        "#,#^#,,#^#,,,#",                                     //  7
        "##^#\"#^#\"\"#",                                     //  8
        "#ab, de\tfg\"\nhij#",                                //  9
        "",                                                   // 10
        "^",                                                  // 11
        "^^",                                                 // 12
        "#\r#^#\r\r#^#\r\r\r#",                               // 13
        "#\r\n#^#\r\n\r\n#^#\r\n\r\n\r\n#",                   // 14
        "#ab, de\tfg\"\rhij#",                                // 15
        "#ab, de\tfg\"\r\nhij#",                              // 16
        "ab\"^ab\"cd",                                        // 17
        "\n\n\n",                                             // 18
        "\t",                                                 // 19
        "\t\t",                                               // 20
        "a\n",                                                // 21
        "a^\n",                                               // 22
        "a^b\n",                                              // 23
        "^\n",                                                // 24
        "####",                                               // 25
        "#^#",                                                // 26
        "###^###",                                            // 27
        "#^##^#",                                             // 28
        "$",                                                  // 29
        "$^$\n#$#^#$$#^$$\n#^##$#^#$##^#^###$^#^#^$###\n",    // 30
        "1-1\n2-1^2-2\n3-1^3-2^3-3\n\n^5-2\n^^6-3\n",         // 31
        "^1-2^#1-3#\n#2-1#^#2-2#^\n#3-1#^^#3-3#",             // 32
    ];

    /* The expected results for csv sets A and B. This is for the default TSV delimiters.*/
    auto tsvSet1 = [
        "a\tb\tc\n",                                          //  1
        "a\tbc\t\t\tdef\n",                                   //  2
        "\ta\t b \t cd \t\n",                                 //  3
        "ß\tßÀß\tあめりか物語\t书名: 五色石\n",                //  4
        " \t  \t   \n",                                       //  5
        " \t  \t   \n",                                       //  6
        ",\t,,\t,,,\n",                                       //  7
        "\t\"\t\"\"\n",                                       //  8
        "ab, de fg\" hij\n",                                  //  9
        "",                                                   // 10
        "\t\n",                                               // 11
        "\t\t\n",                                             // 12
        " \t  \t   \n",                                       // 13
        " \t  \t   \n",                                       // 14
        "ab, de fg\" hij\n",                                  // 15
        "ab, de fg\" hij\n",                                  // 16
        "ab\"\tab\"cd\n",                                     // 17
        "\n\n\n",                                             // 18
        " \n",                                                // 19
        "  \n",                                               // 20
        "a\n",                                                // 21
        "a\t\n",                                              // 22
        "a\tb\n",                                             // 23
        "\t\n",                                               // 24
        "#\n",                                                // 25
        "^\n",                                                // 26
        "#^#\n",                                              // 27
        "^#^\n",                                              // 28
        "$\n",                                                // 29
        "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n",              // 30
        "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n",   // 31
        "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n",               // 32
    ];

    /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab.
     * This will also result in different replacements when TAB and $ appear in the CSV.
     */
    auto tsvSet1_x = [
        "a$b$c\n",                                            //  1
        "a$bc$$$def\n",                                       //  2
        "$a$ b $ cd $\n",                                     //  3
        "ß$ßÀß$あめりか物語$书名: 五色石\n",                   //  4
        " $  $   \n",                                         //  5
        "\t$\t\t$\t\t\t\n",                                   //  6
        ",$,,$,,,\n",                                         //  7
        "$\"$\"\"\n",                                         //  8
        "ab, de\tfg\" hij\n",                                 //  9
        "",                                                   // 10
        "$\n",                                                // 11
        "$$\n",                                               // 12
        " $  $   \n",                                         // 13
        " $  $   \n",                                         // 14
        "ab, de\tfg\" hij\n",                                 // 15
        "ab, de\tfg\" hij\n",                                 // 16
        "ab\"$ab\"cd\n",                                      // 17
        "\n\n\n",                                             // 18
        "\t\n",                                               // 19
        "\t\t\n",                                             // 20
        "a\n",                                                // 21
        "a$\n",                                               // 22
        "a$b\n",                                              // 23
        "$\n",                                                // 24
        "#\n",                                                // 25
        "^\n",                                                // 26
        "#^#\n",                                              // 27
        "^#^\n",                                              // 28
        " \n",                                                // 29
        " $ \n $  $  \n^# $ #^$# ^$^ #\n",                    // 30
        "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n",         // 31
        "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n",                     // 32
    ];

    /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab,
     * and with the delimiter/newline replacement string being |--|. Basically, newlines
     * and '$' in the original data are replaced by |--|.
     */
    auto tsvSet1_y = [
        "a$b$c\n",                                            //  1
        "a$bc$$$def\n",                                       //  2
        "$a$ b $ cd $\n",                                     //  3
        "ß$ßÀß$あめりか物語$书名: 五色石\n",                   //  4
        "|--|$|--||--|$|--||--||--|\n",                       //  5
        "\t$\t\t$\t\t\t\n",                                   //  6
        ",$,,$,,,\n",                                         //  7
        "$\"$\"\"\n",                                         //  8
        "ab, de\tfg\"|--|hij\n",                              //  9
        "",                                                   // 10
        "$\n",                                                // 11
        "$$\n",                                               // 12
        "|--|$|--||--|$|--||--||--|\n",                       // 13
        "|--|$|--||--|$|--||--||--|\n",                       // 14
        "ab, de\tfg\"|--|hij\n",                              // 15
        "ab, de\tfg\"|--|hij\n",                              // 16
        "ab\"$ab\"cd\n",                                      // 17
        "\n\n\n",                                             // 18
        "\t\n",                                               // 19
        "\t\t\n",                                             // 20
        "a\n",                                                // 21
        "a$\n",                                               // 22
        "a$b\n",                                              // 23
        "$\n",                                                // 24
        "#\n",                                                // 25
        "^\n",                                                // 26
        "#^#\n",                                              // 27
        "^#^\n",                                              // 28
        "|--|\n",                                             // 29
        "|--|$|--|\n|--|$|--||--|$|--||--|\n^#|--|$|--|#^$#|--|^$^|--|#\n",   // 30
        "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n",         // 31
        "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n",                     // 32
    ];

    /* Runs one conversion over a fresh byte view of csvText and returns the TSV
     * produced. csv2tsv consumes its input range, so a new view is made per call.
     */
    char[] convert(string csvText, string label, size_t firstLine,
                   char quote, char delim, char outDelim, string replacement)
    {
        ubyte[] csvBytes = cast(ubyte[])csvText;
        auto sink = appender!(char[])();
        csv2tsv(csvBytes, sink, label, firstLine, quote, delim, outDelim, replacement);
        return sink.data;
    }

    foreach (idx, csvA, csvB, want, want_x, want_y;
             lockstep(csvSet1a, csvSet1b, tsvSet1, tsvSet1_x, tsvSet1_y))
    {
        const testNum = idx + 1;

        /* CSV Set A, default TSV settings. */
        auto gotA = convert(csvA, "csvInputA_defaultTSV", idx, '"', ',', '\t', " ");
        assert(want == gotA,
               format("Unittest failure. tsv != tsvResultA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      testNum, csvA, want, gotA));

        /* CSV Set B, default TSV settings. Same expected TSV as Set A. */
        auto gotB = convert(csvB, "csvInputB_defaultTSV", idx, '#', '^', '\t', " ");
        assert(want == gotB,
               format("Unittest failure. tsv != tsvResultB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      testNum, csvB, want, gotB));

        /* CSV Set A, $ as the TSV delimiter. */
        auto got_XA = convert(csvA, "csvInputA_TSV_WithDollarDelimiter", idx, '"', ',', '$', " ");
        assert(want_x == got_XA,
               format("Unittest failure. tsv_x != tsvResult_XA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      testNum, csvA, want_x, got_XA));

        /* CSV Set B, $ as the TSV delimiter. Same expected TSV as Set A. */
        auto got_XB = convert(csvB, "csvInputB__TSV_WithDollarDelimiter", idx, '#', '^', '$', " ");
        assert(want_x == got_XB,
               format("Unittest failure. tsv_x != tsvResult_XB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      testNum, csvB, want_x, got_XB));

        /* CSV Set A, $ as the TSV delimiter and |--| as the replacement string. */
        auto got_YA = convert(csvA, "csvInputA_TSV_WithDollarAndDelimReplacement", idx, '"', ',', '$', "|--|");
        assert(want_y == got_YA,
               format("Unittest failure. tsv_y != tsvResult_YA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      testNum, csvA, want_y, got_YA));

        /* CSV Set B, $ as the TSV delimiter and |--| as the replacement string.
         * Same expected TSV as Set A.
         */
        auto got_YB = convert(csvB, "csvInputB__TSV_WithDollarAndDelimReplacement", idx, '#', '^', '$', "|--|");
        assert(want_y == got_YB,
               format("Unittest failure. tsv_y != tsvResult_YB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      testNum, csvB, want_y, got_YB));
    }
}
671 
672 unittest
673 {
674     /* Unit tests for 'maxRecords' feature of the csv2tsv function.
675      * Each CSV input is converted with maxRecords 1 and 2; a follow-up call then converts the rest. */
676 
677     /* Input CSV. csv1-csv14 use unquoted fields, csv15-csv20 use quoted fields. */
678     auto csv1 = "";
679     auto csv2 = ",";
680     auto csv3 = "a";
681     auto csv4 = "a\n";
682     auto csv5 = "a\nb";
683     auto csv6 = "a\nb\n";
684     auto csv7 = "a\nb\nc";
685     auto csv8 = "a\nb\nc\n";
686     auto csv9 = "a,aa";
687     auto csv10 = "a,aa\n";
688     auto csv11 = "a,aa\nb,bb";
689     auto csv12 = "a,aa\nb,bb\n";
690     auto csv13 = "a,aa\nb,bb\nc,cc";
691     auto csv14 = "a,aa\nb,bb\nc,cc\n";
692 
693     auto csv15 = "\"a\",\"aa\"";
694     auto csv16 = "\"a\",\"aa\"\n";
695     auto csv17 = "\"a\",\"aa\"\n\"b\",\"bb\"";
696     auto csv18 = "\"a\",\"aa\"\n\"b\",\"bb\"\n";
697     auto csv19 = "\"a\",\"aa\"\n\"b\",\"bb\"\n\"c\",\"cc\"";
698     auto csv20 = "\"a\",\"aa\"\n\"b\",\"bb\"\n\"c\",\"cc\"\n";
699 
700     /* Expected TSV output when converting with maxRecords == 1. */
701     auto tsv1_max1 = "";
702     auto tsv2_max1 = "\t\n";
703     auto tsv3_max1 = "a\n";
704     auto tsv4_max1 = "a\n";
705     auto tsv5_max1 = "a\n";
706     auto tsv6_max1 = "a\n";
707     auto tsv7_max1 = "a\n";
708     auto tsv8_max1 = "a\n";
709     auto tsv9_max1 = "a\taa\n";
710     auto tsv10_max1 = "a\taa\n";
711     auto tsv11_max1 = "a\taa\n";
712     auto tsv12_max1 = "a\taa\n";
713     auto tsv13_max1 = "a\taa\n";
714     auto tsv14_max1 = "a\taa\n";
715 
716     auto tsv15_max1 = "a\taa\n";
717     auto tsv16_max1 = "a\taa\n";
718     auto tsv17_max1 = "a\taa\n";
719     auto tsv18_max1 = "a\taa\n";
720     auto tsv19_max1 = "a\taa\n";
721     auto tsv20_max1 = "a\taa\n";
722 
723     /* Expected TSV from the follow-up call converting what remains after the maxRecords == 1 call. */
724     auto tsv1_max1_rest = "";
725     auto tsv2_max1_rest = "";
726     auto tsv3_max1_rest = "";
727     auto tsv4_max1_rest = "";
728     auto tsv5_max1_rest = "b\n";
729     auto tsv6_max1_rest = "b\n";
730     auto tsv7_max1_rest = "b\nc\n";
731     auto tsv8_max1_rest = "b\nc\n";
732     auto tsv9_max1_rest = "";
733     auto tsv10_max1_rest = "";
734     auto tsv11_max1_rest = "b\tbb\n";
735     auto tsv12_max1_rest = "b\tbb\n";
736     auto tsv13_max1_rest = "b\tbb\nc\tcc\n";
737     auto tsv14_max1_rest = "b\tbb\nc\tcc\n";
738 
739     auto tsv15_max1_rest = "";
740     auto tsv16_max1_rest = "";
741     auto tsv17_max1_rest = "b\tbb\n";
742     auto tsv18_max1_rest = "b\tbb\n";
743     auto tsv19_max1_rest = "b\tbb\nc\tcc\n";
744     auto tsv20_max1_rest = "b\tbb\nc\tcc\n";
745 
746     /* Expected TSV output when converting with maxRecords == 2. */
747     auto tsv1_max2 = "";
748     auto tsv2_max2 = "\t\n";
749     auto tsv3_max2 = "a\n";
750     auto tsv4_max2 = "a\n";
751     auto tsv5_max2 = "a\nb\n";
752     auto tsv6_max2 = "a\nb\n";
753     auto tsv7_max2 = "a\nb\n";
754     auto tsv8_max2 = "a\nb\n";
755     auto tsv9_max2 = "a\taa\n";
756     auto tsv10_max2 = "a\taa\n";
757     auto tsv11_max2 = "a\taa\nb\tbb\n";
758     auto tsv12_max2 = "a\taa\nb\tbb\n";
759     auto tsv13_max2 = "a\taa\nb\tbb\n";
760     auto tsv14_max2 = "a\taa\nb\tbb\n";
761 
762     auto tsv15_max2 = "a\taa\n";
763     auto tsv16_max2 = "a\taa\n";
764     auto tsv17_max2 = "a\taa\nb\tbb\n";
765     auto tsv18_max2 = "a\taa\nb\tbb\n";
766     auto tsv19_max2 = "a\taa\nb\tbb\n";
767     auto tsv20_max2 = "a\taa\nb\tbb\n";
768 
769     /* Expected TSV from the follow-up call converting what remains after the maxRecords == 2 call. */
770     auto tsv1_max2_rest = "";
771     auto tsv2_max2_rest = "";
772     auto tsv3_max2_rest = "";
773     auto tsv4_max2_rest = "";
774     auto tsv5_max2_rest = "";
775     auto tsv6_max2_rest = "";
776     auto tsv7_max2_rest = "c\n";
777     auto tsv8_max2_rest = "c\n";
778     auto tsv9_max2_rest = "";
779     auto tsv10_max2_rest = "";
780     auto tsv11_max2_rest = "";
781     auto tsv12_max2_rest = "";
782     auto tsv13_max2_rest = "c\tcc\n";
783     auto tsv14_max2_rest = "c\tcc\n";
784 
785     auto tsv15_max2_rest = "";
786     auto tsv16_max2_rest = "";
787     auto tsv17_max2_rest = "";
788     auto tsv18_max2_rest = "";
789     auto tsv19_max2_rest = "c\tcc\n";
790     auto tsv20_max2_rest = "c\tcc\n";
791 
792     auto csvSet1 =
793         [csv1, csv2, csv3, csv4, csv5, csv6, csv7,
794          csv8, csv9, csv10, csv11, csv12, csv13, csv14,
795          csv15, csv16, csv17, csv18, csv19, csv20 ];
796 
797     auto tsvMax1Set1 =
798         [tsv1_max1, tsv2_max1, tsv3_max1, tsv4_max1, tsv5_max1, tsv6_max1, tsv7_max1,
799          tsv8_max1, tsv9_max1, tsv10_max1, tsv11_max1, tsv12_max1, tsv13_max1, tsv14_max1,
800          tsv15_max1, tsv16_max1, tsv17_max1, tsv18_max1, tsv19_max1, tsv20_max1];
801 
802     auto tsvMax1RestSet1 =
803         [tsv1_max1_rest, tsv2_max1_rest, tsv3_max1_rest, tsv4_max1_rest, tsv5_max1_rest, tsv6_max1_rest, tsv7_max1_rest,
804          tsv8_max1_rest, tsv9_max1_rest, tsv10_max1_rest, tsv11_max1_rest, tsv12_max1_rest, tsv13_max1_rest, tsv14_max1_rest,
805          tsv15_max1_rest, tsv16_max1_rest, tsv17_max1_rest, tsv18_max1_rest, tsv19_max1_rest, tsv20_max1_rest];
806 
807     auto tsvMax2Set1 =
808         [tsv1_max2, tsv2_max2, tsv3_max2, tsv4_max2, tsv5_max2, tsv6_max2, tsv7_max2,
809          tsv8_max2, tsv9_max2, tsv10_max2, tsv11_max2, tsv12_max2, tsv13_max2, tsv14_max2,
810          tsv15_max2, tsv16_max2, tsv17_max2, tsv18_max2, tsv19_max2, tsv20_max2];
811 
812     auto tsvMax2RestSet1 =
813         [tsv1_max2_rest, tsv2_max2_rest, tsv3_max2_rest, tsv4_max2_rest, tsv5_max2_rest, tsv6_max2_rest, tsv7_max2_rest,
814          tsv8_max2_rest, tsv9_max2_rest, tsv10_max2_rest, tsv11_max2_rest, tsv12_max2_rest, tsv13_max2_rest, tsv14_max2_rest,
815          tsv15_max2_rest, tsv16_max2_rest, tsv17_max2_rest, tsv18_max2_rest, tsv19_max2_rest, tsv20_max2_rest];
816 
817     foreach (i, csv, tsv_max1, tsv_max1_rest, tsv_max2, tsv_max2_rest;
818              lockstep(csvSet1, tsvMax1Set1, tsvMax1RestSet1, tsvMax2Set1, tsvMax2RestSet1))
819     {
820         /* Byte stream for csv2tsv. Consumed by csv2tsv, so it needs to be reset when re-used. */
821         ubyte[] csvInput = cast(ubyte[])csv;
822 
823         /* Call with maxRecords == 1. */
824         auto tsvMax1Result = appender!(char[])();
825         csv2tsv(csvInput, tsvMax1Result, "maxRecords-one", i, '"', ',', '\t', " ", NullableSizeT(1));
826         assert(tsv_max1 == tsvMax1Result.data,
827                format("Unittest failure. tsv_max1 != tsvMax1Result.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
828                       i + 1, csv, tsv_max1, tsvMax1Result.data));
829 
830         /* Follow-up call (no maxRecords argument) getting all records remaining after the maxRecords==1 call. */
831         auto tsvMax1RestResult = appender!(char[])();
832         csv2tsv(csvInput, tsvMax1RestResult, "maxRecords-one-followup", i);
833         assert(tsv_max1_rest == tsvMax1RestResult.data,
834                format("Unittest failure. tsv_max1_rest != tsvMax1RestResult.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
835                       i + 1, csv, tsv_max1_rest, tsvMax1RestResult.data));
836 
837         /* Reset the input stream to the full CSV text for the maxRecords == 2 pass. */
838         csvInput = cast(ubyte[])csv;
839 
840         /* Call with maxRecords == 2. */
841         auto tsvMax2Result = appender!(char[])();
842         csv2tsv(csvInput, tsvMax2Result, "maxRecords-two", i, '"', ',', '\t', " ", NullableSizeT(2));
843         assert(tsv_max2 == tsvMax2Result.data,
844                format("Unittest failure. tsv_max2 != tsvMax2Result.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
845                       i + 1, csv, tsv_max2, tsvMax2Result.data));
846 
847         /* Follow-up call (no maxRecords argument) getting all records remaining after the maxRecords==2 call. */
848         auto tsvMax2RestResult = appender!(char[])();
849         csv2tsv(csvInput, tsvMax2RestResult, "maxRecords-two-followup", i);
850         assert(tsv_max2_rest == tsvMax2RestResult.data,
851                format("Unittest failure. tsv_max2_rest != tsvMax2RestResult.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
852                       i + 1, csv, tsv_max2_rest, tsvMax2RestResult.data));
853     }
854 }