tsv_utils.csv2tsv source code

1 /**
2 Convert CSV formatted data to TSV format.
3 
4 This program converts comma-separated value data to tab-separated format.
5 
6 Copyright (c) 2016-2019, eBay Software Foundation
7 Initially written by Jon Degenhardt
8 
9 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
10 */
11 
12 module tsv_utils.csv2tsv;
13 
14 import std.stdio;
15 import std.format : format;
16 import std.range;
17 import std.traits : Unqual;
18 import std.typecons : Nullable, tuple;
19 
/** Brief help text. Passed to defaultGetoptPrinter, together with the option
 * list, when help is requested on the command line.
 */
immutable helpText = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts comma-separated text (CSV) to tab-separated format (TSV). Records
are read from files or standard input, converted records written to standard output.
Use '--help-verbose' for details on the CSV formats accepted.

Options:
EOS";
29 
/** Detailed help text. Passed to defaultGetoptPrinter, together with the option
 * list, when '--help-verbose' is given.
 */
immutable helpTextVerbose = q"EOS
Synopsis: csv2tsv [options] [file...]

csv2tsv converts CSV (comma-separated) text to TSV (tab-separated) format. Records
are read from files or standard input, converted records written to standard output.

Both formats represent tabular data, each record on its own line, fields separated
by a delimiter character. The key difference is that CSV uses escape sequences to
represent newlines and field separators in the data, whereas TSV disallows these
characters in the data. The most common field delimiters are comma for CSV and tab
for TSV, but any character can be used.

Conversion to TSV is done by removing CSV escape syntax, changing field delimiters,
and replacing newlines and field delimiters in the data. By default, newlines and
field delimiters in the data are replaced by spaces. Most details are customizable.

There is no single spec for CSV, any number of variants can be found. The escape
syntax is common enough: fields containing newlines or field delimiters are placed
in double quotes. Inside a quoted field, a double quote is represented by a pair of
double quotes. As with field separators, the quoting character is customizable.

Behaviors of this program that often vary between CSV implementations:
  * Newlines are supported in quoted fields.
  * Double quotes are permitted in a non-quoted field. However, a field starting
    with a quote must follow quoting rules.
  * Each record can have a different number of fields.
  * The three common forms of newlines are supported: CR, CRLF, LF.
  * A newline will be added if the file does not end with one.
  * No whitespace trimming is done.

This program does not validate CSV correctness, but will terminate with an error
upon reaching an inconsistent state. Improperly terminated quoted fields are the
primary cause.

UTF-8 input is assumed. Convert other encodings prior to invoking this tool.

Options:
EOS";
68 
/** Container for command line options.
 *
 * The field initializers are the option defaults. processArgs overwrites them with
 * the values given on the command line.
 */
struct Csv2tsvOptions
{
    string programName;                // Set by processArgs from cmdArgs[0].
    bool helpVerbose = false;          // --help-verbose
    bool hasHeader = false;            // -H|--header
    char csvQuoteChar = '"';           // -q|--quote
    char csvDelimChar = ',';           // -c|--csv-delim
    char tsvDelimChar = '\t';          // -t|--tsv-delim
    string tsvDelimReplacement = " ";  // -r|--replacement
    bool versionWanted = false;        // -V|--version

    /** Parses the command line arguments and validates the resulting option values.
     *
     * Help and version requests are handled here by printing the corresponding text.
     * Argument errors are not propagated: the message is written to stderr and an
     * exit code of 1 is returned in the result tuple.
     *
     * Params:
     *   cmdArgs = The command line, program name first. Passed by reference to getopt;
     *             main treats everything after the first element as input file names
     *             once this call returns.
     *
     * Returns: A (bool, int) tuple. The bool is true when the caller should go on to
     *   process input. When it is false the int is the exit code to terminate with:
     *   0 after printing help or version text, 1 after an argument error.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : canFind;
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            /* The std.getopt.config entries are positional: each one applies to the
             * options listed after it. Case sensitivity is switched on ahead of the
             * short options and switched back off at the end of the list.
             */
            auto r = getopt(
                cmdArgs,
                "help-verbose",  "     Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",      "     Treat the first line of each file as a header. Only the header of the first file is output.", &hasHeader,
                std.getopt.config.caseSensitive,
                "q|quote",       "CHR  Quoting character in CSV data. Default: double-quote (\")", &csvQuoteChar,
                "c|csv-delim",   "CHR  Field delimiter in CSV data. Default: comma (,).", &csvDelimChar,
                "t|tsv-delim",   "CHR  Field delimiter in TSV data. Default: TAB", &tsvDelimChar,
                "r|replacement", "STR  Replacement for newline and TSV field delimiters found in CSV input. Default: Space.", &tsvDelimReplacement,
                std.getopt.config.caseSensitive,
                "V|version",     "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            /* Informational requests: print and tell the caller to exit successfully. */
            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("csv2tsv"));
                return tuple(false, 0);
            }

            /* Consistency checks. Each failure is reported via the catch block below. */
            if (csvQuoteChar == '\n' || csvQuoteChar == '\r')
            {
                throw new Exception ("CSV quote character cannot be newline (--q|quote).");
            }

            if (csvQuoteChar == csvDelimChar)
            {
                throw new Exception("CSV quote and CSV field delimiter characters must be different (--q|quote, --c|csv-delim).");
            }

            if (csvQuoteChar == tsvDelimChar)
            {
                throw new Exception("CSV quote and TSV field delimiter characters must be different (--q|quote, --t|tsv-delim).");
            }

            if (csvDelimChar == '\n' || csvDelimChar == '\r')
            {
                throw new Exception ("CSV field delimiter cannot be newline (--c|csv-delim).");
            }

            if (tsvDelimChar == '\n' || tsvDelimChar == '\r')
            {
                /* The option is registered as 't|tsv-delim'; the message names it as such. */
                throw new Exception ("TSV field delimiter cannot be newline (--t|tsv-delim).");
            }

            /* The replacement text is written into TSV fields, so it must not itself
             * contain a record or field terminator.
             */
            if (canFind!(c => (c == '\n' || c == '\r' || c == tsvDelimChar))(tsvDelimReplacement))
            {
                throw new Exception ("Replacement character cannot contain newlines or TSV field delimiters (--r|replacement).");
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
163 
/* Runtime options embedded in the executable. Compiled in only when the compiler
 * reports __VERSION__ >= 2085; passes "gcopt=cleanup:none" to the D runtime.
 * NOTE(review): presumably this skips the garbage collector's cleanup pass at program
 * exit to shorten shutdown - confirm against the druntime GC configuration docs.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
165 
version(unittest)
{
    // No main in unittest
}
else
{
    /** Program entry point.
     *
     * Processes the command line, then hands the remaining arguments (everything
     * after the program name) to csv2tsvFiles as the list of input files.
     *
     * Returns: The exit code from argument processing when it asks to stop (0 after
     *   help/version output, 1 on an argument error), 1 when the conversion throws,
     *   and 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        Csv2tsvOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try csv2tsvFiles(cmdopt, cmdArgs[1..$]);
        catch (Exception exc)
        {
            /* End the current output line and flush standard output before reporting,
             * so text already written to stdout is pushed out ahead of the message on
             * stderr. It is the output stream that needs flushing here, not stdin.
             */
            writeln();
            stdout.flush();
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}
201 
/* This uses a D feature where a type can reserve a single value to represent null.
 * Here the reserved value is size_t.max, so a NullableSizeT needs no separate
 * "is null" flag; the trade-off is that size_t.max cannot be held as a regular value.
 */
alias NullableSizeT = Nullable!(size_t, size_t.max);
204 
205 
/** Converts every input file, or standard input, from CSV to TSV on standard output.
 *
 * An empty inputFiles list is treated as the single entry "-", and "-" selects
 * standard input. When cmdopt.hasHeader is set, the first record of each file after
 * the first is converted into a discarding sink, so only the first file's header
 * reaches the output.
 *
 * Params:
 *   cmdopt     = Parsed command line options (delimiters, quote, replacement, header flag).
 *   inputFiles = Names of the files to convert, in order.
 */
void csv2tsvFiles(in Csv2tsvOptions cmdopt, in string[] inputFiles)
{
    import std.algorithm : joiner;
    import tsv_utils.common.utils : BufferedOutputRange;

    /* One read buffer serves both cases; standard input is read through a short
     * slice at the front of it.
     */
    ubyte[1024 * 1024] fileRawBuf;
    ubyte[] stdinRawBuf = fileRawBuf[0..1024];
    auto bufferedStdout = BufferedOutputRange!(typeof(stdout))(stdout);

    const sources = (inputFiles.length > 0) ? inputFiles : ["-"];

    foreach (fileIndex, filename; sources)
    {
        const readingStdin = (filename == "-");
        auto chunks = readingStdin ?
            stdin.byChunk(stdinRawBuf) : filename.File.byChunk(fileRawBuf);
        auto byteStream = chunks.joiner;

        /* Number of lines of this file consumed before the main conversion call.
         * Handed to csv2tsv as the starting offset for error line numbers.
         */
        size_t linesConsumed = 0;

        if (cmdopt.hasHeader && fileIndex > 0)
        {
            /* Not the first file: run the header record through a null sink so it
             * is consumed from the stream without being written.
             */
            auto discard = NullSink();
            csv2tsv(byteStream, discard, filename, linesConsumed,
                    cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                    cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement,
                    NullableSizeT(1));
            linesConsumed = 1;
        }

        csv2tsv(byteStream, bufferedStdout, filename, linesConsumed,
                cmdopt.csvQuoteChar, cmdopt.csvDelimChar,
                cmdopt.tsvDelimChar, cmdopt.tsvDelimReplacement);
    }
}
248 
/** Read CSV from an input source, convert to TSV and write to an output source.
 *
 * The input is processed one byte at a time by a four-state machine (field end,
 * non-quoted field, quoted field, quote seen inside a quoted field). CR and CRLF are
 * collapsed to LF before the state machine sees them. If the input ends in the middle
 * of a record, a terminating newline is written.
 *
 * Params:
 *   inputStream         =  A ubyte input range to read CSV text from. A ubyte range
 *                          matches byChunk. It also avoids conversion to dchar by front().
 *   outputStream        =  An output range to write TSV text to.
 *   filename            =  Name of file to use when reporting errors. A descriptive name
 *                          can be used in lieu of a file name. The name "-" is reported
 *                          as "Standard Input".
 *   currFileLineNumber  =  First line being processed. Used when reporting errors. Needed
 *                          only when part of the input has already been processed.
 *   csvQuote            =  The quoting character used in the input CSV file.
 *   csvDelim            =  The field delimiter character used in the input CSV file.
 *   tsvDelim            =  The field delimiter character to use in the generated TSV file.
 *   tsvDelimReplacement =  A string to use when replacing newlines and TSV field delimiters
 *                          occurring in CSV fields.
 *   maxRecords          =  The maximum number of records to process (output lines). This is
 *                          intended to support processing the header line separately.
 *                          Null (the default) means no limit.
 *
 * Throws: Exception on finding inconsistent CSV. Exception text includes the filename and
 *         line number where the error was identified. The line number reported is
 *         currFileLineNumber plus the count of output records, not a count of input
 *         lines.
 */
void csv2tsv(InputRange, OutputRange)
    (auto ref InputRange inputStream, auto ref OutputRange outputStream,
     string filename = "(none)", size_t currFileLineNumber = 0,
     const char csvQuote = '"', const char csvDelim = ',', const char tsvDelim = '\t',
     string tsvDelimReplacement = " ",
     NullableSizeT maxRecords=NullableSizeT.init,
     )
if (isInputRange!InputRange && isOutputRange!(OutputRange, char) &&
    is(Unqual!(ElementType!InputRange) == ubyte))
{
    enum State { FieldEnd, NonQuotedField, QuotedField, QuoteInQuotedField }

    State currState = State.FieldEnd;
    size_t recordNum = 1;      // Record number. Output line number.
    size_t fieldNum = 0;       // Field on current line. Zero means no field started yet.

InputLoop: while (!inputStream.empty)
    {
        char nextChar = inputStream.front;
        inputStream.popFront;

        if (nextChar == '\r')
        {
            /* Collapse newline cases to '\n'. A '\n' directly after the '\r' is
             * consumed here so CRLF counts as a single newline.
             */
            if (!inputStream.empty && inputStream.front == '\n')
            {
                inputStream.popFront;
            }
            nextChar = '\n';
        }

    OuterSwitch: final switch (currState)
        {
        case State.FieldEnd:
            /* Start of input or after consuming a field terminator. */
            ++fieldNum;

            /* Note: Can't use a switch here due to the 'goto case' to the OuterSwitch. */
            if (nextChar == csvQuote)
            {
                /* Opening quote: consumed, not written to the output. */
                currState = State.QuotedField;
                break OuterSwitch;
            }
            else
            {
                /* Processing state change only. Don't consume the character. */
                currState = State.NonQuotedField;
                goto case State.NonQuotedField;
            }

        case State.NonQuotedField:
            switch (nextChar)
            {
            default:
                /* Ordinary data byte. This includes quote characters, which carry no
                 * special meaning once a field has started unquoted.
                 */
                put(outputStream, nextChar);
                break OuterSwitch;
            case csvDelim:
                /* End of field: emit the TSV delimiter in place of the CSV one. */
                put(outputStream, tsvDelim);
                currState = State.FieldEnd;
                break OuterSwitch;
            case tsvDelim:
                /* TSV delimiter appearing in the data: substitute the replacement. */
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            case '\n':
                /* End of record. */
                put(outputStream, '\n');
                ++recordNum;
                fieldNum = 0;
                currState = State.FieldEnd;
                /* recordNum is the number of the next record, so exceeding maxRecords
                 * means the requested count has been written. Input past this newline
                 * is left unread.
                 */
                if (!maxRecords.isNull && recordNum > maxRecords) break InputLoop;
                else break OuterSwitch;
            }

        case State.QuotedField:
            switch (nextChar)
            {
            default:
                /* Ordinary data byte. CSV delimiters land here and are copied as-is. */
                put(outputStream, nextChar);
                break OuterSwitch;
            case csvQuote:
                /* Quote in a quoted field. Need to look at the next character. */
                if (!inputStream.empty)
                {
                    currState = State.QuoteInQuotedField;
                }
                else
                {
                    /* End of input. A rare case: Quoted field on last line with no
                     * following trailing newline. Reset the state to avoid triggering
                     * an invalid quoted field exception, plus adding additional newline.
                     */
                    currState = State.FieldEnd;
                }
                break OuterSwitch;
            case '\n':
                /* Newline in a quoted field. Replaced; the record does not end here. */
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            case tsvDelim:
                /* TSV delimiter in a quoted field. Replaced, as in the non-quoted case. */
                put(outputStream, tsvDelimReplacement);
                break OuterSwitch;
            }

        case State.QuoteInQuotedField:
            /* Just processed a quote in a quoted field. */
            switch (nextChar)
            {
            case csvQuote:
                /* Doubled quote: an escaped quote character in the data. */
                put(outputStream, csvQuote);
                currState = State.QuotedField;
                break OuterSwitch;
            case csvDelim:
                /* The previous quote closed the field; this delimiter ends it. */
                put(outputStream, tsvDelim);
                currState = State.FieldEnd;
                break OuterSwitch;
            case '\n':
                /* The previous quote closed the field; this newline ends the record. */
                put(outputStream, '\n');
                ++recordNum;
                fieldNum = 0;
                currState = State.FieldEnd;

                if (!maxRecords.isNull && recordNum > maxRecords) break InputLoop;
                else break OuterSwitch;
            default:
                /* Anything else directly after a closing quote is inconsistent CSV. */
                throw new Exception(
                    format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                           (filename == "-") ? "Standard Input" : filename,
                           currFileLineNumber + recordNum));
            }
        }
    }

    /* Input ended while still inside a quoted field: the closing quote is missing. */
    if (currState == State.QuotedField)
    {
        throw new Exception(
            format("Invalid CSV. Improperly terminated quoted field. File: %s, Line: %d",
                   (filename == "-") ? "Standard Input" : filename,
                   currFileLineNumber + recordNum));
    }

    if (fieldNum > 0) put(outputStream, '\n');    // Last line w/o terminating newline.
}
411 
unittest
{
    /* Unit tests for the csv2tsv function.
     *
     * These unit tests exercise different CSV combinations and escaping cases. The CSV
     * data content is the same for each corresponding test string, except the delimiters
     * have been changed. e.g. csv6a and csv6b have the same data content.
     *
     * A property used in these tests is that changing the CSV delimiters doesn't change
     * the resulting TSV. However, changing the TSV delimiters will change the TSV result,
     * as TSV doesn't support having its delimiters in the data. This allows having a
     * single TSV expected set that is generated by CSVs with different delimiter sets.
     *
     * This test set does not test main, file handling, or error messages. These are
     * handled by tests run against the executable.
     */

    /* Default CSV. */
    auto csv1a = "a,b,c";
    auto csv2a = "a,bc,,,def";
    auto csv3a = ",a, b , cd ,";
    auto csv4a = "ß,ßÀß,あめりか物語,书名: 五色石";
    auto csv5a = "\"\n\",\"\n\n\",\"\n\n\n\"";
    auto csv6a = "\"\t\",\"\t\t\",\"\t\t\t\"";
    auto csv7a = "\",\",\",,\",\",,,\"";
    auto csv8a = "\"\",\"\"\"\",\"\"\"\"\"\"";
    auto csv9a = "\"ab, de\tfg\"\"\nhij\"";
    auto csv10a = "";
    auto csv11a = ",";
    auto csv12a = ",,";
    auto csv13a = "\"\r\",\"\r\r\",\"\r\r\r\"";
    auto csv14a = "\"\r\n\",\"\r\n\r\n\",\"\r\n\r\n\r\n\"";
    auto csv15a = "\"ab, de\tfg\"\"\rhij\"";
    auto csv16a = "\"ab, de\tfg\"\"\r\nhij\"";
    auto csv17a = "ab\",ab\"cd";
    auto csv18a = "\n\n\n";
    auto csv19a = "\t";
    auto csv20a = "\t\t";
    auto csv21a = "a\n";
    auto csv22a = "a,\n";
    auto csv23a = "a,b\n";
    auto csv24a = ",\n";
    auto csv25a = "#";
    auto csv26a = "^";
    auto csv27a = "#^#";
    auto csv28a = "^#^";
    auto csv29a = "$";
    auto csv30a = "$,$\n\"$\",\"$$\",$$\n^#$,$#^,#$^,^$#\n";
    auto csv31a = "1-1\n2-1,2-2\n3-1,3-2,3-3\n\n,5-2\n,,6-3\n";
    auto csv32a = ",1-2,\"1-3\"\n\"2-1\",\"2-2\",\n\"3-1\",,\"3-3\"";

    /* Set B has the same data and TSV results as set A, but uses # for quote and ^ for comma. */
    auto csv1b = "a^b^c";
    auto csv2b = "a^bc^^^def";
    auto csv3b = "^a^ b ^ cd ^";
    auto csv4b = "ß^ßÀß^あめりか物語^书名: 五色石";
    auto csv5b = "#\n#^#\n\n#^#\n\n\n#";
    auto csv6b = "#\t#^#\t\t#^#\t\t\t#";
    auto csv7b = "#,#^#,,#^#,,,#";
    auto csv8b = "##^#\"#^#\"\"#";
    auto csv9b = "#ab, de\tfg\"\nhij#";
    auto csv10b = "";
    auto csv11b = "^";
    auto csv12b = "^^";
    auto csv13b = "#\r#^#\r\r#^#\r\r\r#";
    auto csv14b = "#\r\n#^#\r\n\r\n#^#\r\n\r\n\r\n#";
    auto csv15b = "#ab, de\tfg\"\rhij#";
    auto csv16b = "#ab, de\tfg\"\r\nhij#";
    auto csv17b = "ab\"^ab\"cd";
    auto csv18b = "\n\n\n";
    auto csv19b = "\t";
    auto csv20b = "\t\t";
    auto csv21b = "a\n";
    auto csv22b = "a^\n";
    auto csv23b = "a^b\n";
    auto csv24b = "^\n";
    auto csv25b = "####";
    auto csv26b = "#^#";
    auto csv27b = "###^###";
    auto csv28b = "#^##^#";
    auto csv29b = "$";
    auto csv30b = "$^$\n#$#^#$$#^$$\n#^##$#^#$##^#^###$^#^#^$###\n";
    auto csv31b = "1-1\n2-1^2-2\n3-1^3-2^3-3\n\n^5-2\n^^6-3\n";
    auto csv32b = "^1-2^#1-3#\n#2-1#^#2-2#^\n#3-1#^^#3-3#";

    /* The expected results for csv sets A and B. This is for the default TSV delimiters. */
    auto tsv1 = "a\tb\tc\n";
    auto tsv2 = "a\tbc\t\t\tdef\n";
    auto tsv3 = "\ta\t b \t cd \t\n";
    auto tsv4 = "ß\tßÀß\tあめりか物語\t书名: 五色石\n";
    auto tsv5 = " \t  \t   \n";
    auto tsv6 = " \t  \t   \n";
    auto tsv7 = ",\t,,\t,,,\n";
    auto tsv8 = "\t\"\t\"\"\n";
    auto tsv9 = "ab, de fg\" hij\n";
    auto tsv10 = "";
    auto tsv11 = "\t\n";
    auto tsv12 = "\t\t\n";
    auto tsv13 = " \t  \t   \n";
    auto tsv14 = " \t  \t   \n";
    auto tsv15 = "ab, de fg\" hij\n";
    auto tsv16 = "ab, de fg\" hij\n";
    auto tsv17 = "ab\"\tab\"cd\n";
    auto tsv18 = "\n\n\n";
    auto tsv19 = " \n";
    auto tsv20 = "  \n";
    auto tsv21 = "a\n";
    auto tsv22 = "a\t\n";
    auto tsv23 = "a\tb\n";
    auto tsv24 = "\t\n";
    auto tsv25 = "#\n";
    auto tsv26 = "^\n";
    auto tsv27 = "#^#\n";
    auto tsv28 = "^#^\n";
    auto tsv29 = "$\n";
    auto tsv30 = "$\t$\n$\t$$\t$$\n^#$\t$#^\t#$^\t^$#\n";
    auto tsv31 = "1-1\n2-1\t2-2\n3-1\t3-2\t3-3\n\n\t5-2\n\t\t6-3\n";
    auto tsv32 = "\t1-2\t1-3\n2-1\t2-2\t\n3-1\t\t3-3\n";

    /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab.
     * This will also result in different replacements when TAB and $ appear in the CSV.
     */
    auto tsv1_x = "a$b$c\n";
    auto tsv2_x = "a$bc$$$def\n";
    auto tsv3_x = "$a$ b $ cd $\n";
    auto tsv4_x = "ß$ßÀß$あめりか物語$书名: 五色石\n";
    auto tsv5_x = " $  $   \n";
    auto tsv6_x = "\t$\t\t$\t\t\t\n";
    auto tsv7_x = ",$,,$,,,\n";
    auto tsv8_x = "$\"$\"\"\n";
    auto tsv9_x = "ab, de\tfg\" hij\n";
    auto tsv10_x = "";
    auto tsv11_x = "$\n";
    auto tsv12_x = "$$\n";
    auto tsv13_x = " $  $   \n";
    auto tsv14_x = " $  $   \n";
    auto tsv15_x = "ab, de\tfg\" hij\n";
    auto tsv16_x = "ab, de\tfg\" hij\n";
    auto tsv17_x = "ab\"$ab\"cd\n";
    auto tsv18_x = "\n\n\n";
    auto tsv19_x = "\t\n";
    auto tsv20_x = "\t\t\n";
    auto tsv21_x = "a\n";
    auto tsv22_x = "a$\n";
    auto tsv23_x = "a$b\n";
    auto tsv24_x = "$\n";
    auto tsv25_x = "#\n";
    auto tsv26_x = "^\n";
    auto tsv27_x = "#^#\n";
    auto tsv28_x = "^#^\n";
    auto tsv29_x = " \n";
    auto tsv30_x = " $ \n $  $  \n^# $ #^$# ^$^ #\n";
    auto tsv31_x = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n";
    auto tsv32_x = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n";

    /* The TSV results for CSV sets 1a and 1b, but with $ as the delimiter rather than tab,
     * and with the delimiter/newline replacement string being |--|. Basically, newlines
     * and '$' in the original data are replaced by |--|.
     */
    auto tsv1_y = "a$b$c\n";
    auto tsv2_y = "a$bc$$$def\n";
    auto tsv3_y = "$a$ b $ cd $\n";
    auto tsv4_y = "ß$ßÀß$あめりか物語$书名: 五色石\n";
    auto tsv5_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv6_y = "\t$\t\t$\t\t\t\n";
    auto tsv7_y = ",$,,$,,,\n";
    auto tsv8_y = "$\"$\"\"\n";
    auto tsv9_y = "ab, de\tfg\"|--|hij\n";
    auto tsv10_y = "";
    auto tsv11_y = "$\n";
    auto tsv12_y = "$$\n";
    auto tsv13_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv14_y = "|--|$|--||--|$|--||--||--|\n";
    auto tsv15_y = "ab, de\tfg\"|--|hij\n";
    auto tsv16_y = "ab, de\tfg\"|--|hij\n";
    auto tsv17_y = "ab\"$ab\"cd\n";
    auto tsv18_y = "\n\n\n";
    auto tsv19_y = "\t\n";
    auto tsv20_y = "\t\t\n";
    auto tsv21_y = "a\n";
    auto tsv22_y = "a$\n";
    auto tsv23_y = "a$b\n";
    auto tsv24_y = "$\n";
    auto tsv25_y = "#\n";
    auto tsv26_y = "^\n";
    auto tsv27_y = "#^#\n";
    auto tsv28_y = "^#^\n";
    auto tsv29_y = "|--|\n";
    auto tsv30_y = "|--|$|--|\n|--|$|--||--|$|--||--|\n^#|--|$|--|#^$#|--|^$^|--|#\n";
    auto tsv31_y = "1-1\n2-1$2-2\n3-1$3-2$3-3\n\n$5-2\n$$6-3\n";
    auto tsv32_y = "$1-2$1-3\n2-1$2-2$\n3-1$$3-3\n";

    /* The five sets below are parallel: entry N of each belongs to test case N. */
    auto csvSet1a = [csv1a, csv2a, csv3a, csv4a, csv5a, csv6a, csv7a, csv8a, csv9a, csv10a,
                     csv11a, csv12a, csv13a, csv14a, csv15a, csv16a, csv17a, csv18a, csv19a, csv20a,
                     csv21a, csv22a, csv23a, csv24a, csv25a, csv26a, csv27a, csv28a, csv29a, csv30a,
                     csv31a, csv32a];

    auto csvSet1b = [csv1b, csv2b, csv3b, csv4b, csv5b, csv6b, csv7b, csv8b, csv9b, csv10b,
                     csv11b, csv12b, csv13b, csv14b, csv15b, csv16b, csv17b, csv18b, csv19b, csv20b,
                     csv21b, csv22b, csv23b, csv24b, csv25b, csv26b, csv27b, csv28b, csv29b, csv30b,
                     csv31b, csv32b];

    auto tsvSet1  = [tsv1, tsv2, tsv3, tsv4, tsv5, tsv6, tsv7, tsv8, tsv9, tsv10,
                     tsv11, tsv12, tsv13, tsv14, tsv15, tsv16, tsv17, tsv18, tsv19, tsv20,
                     tsv21, tsv22, tsv23, tsv24, tsv25, tsv26, tsv27, tsv28, tsv29, tsv30,
                     tsv31, tsv32];

    auto tsvSet1_x  = [tsv1_x, tsv2_x, tsv3_x, tsv4_x, tsv5_x, tsv6_x, tsv7_x, tsv8_x, tsv9_x, tsv10_x,
                       tsv11_x, tsv12_x, tsv13_x, tsv14_x, tsv15_x, tsv16_x, tsv17_x, tsv18_x, tsv19_x, tsv20_x,
                       tsv21_x, tsv22_x, tsv23_x, tsv24_x, tsv25_x, tsv26_x, tsv27_x, tsv28_x, tsv29_x, tsv30_x,
                       tsv31_x, tsv32_x];

    auto tsvSet1_y  = [tsv1_y, tsv2_y, tsv3_y, tsv4_y, tsv5_y, tsv6_y, tsv7_y, tsv8_y, tsv9_y, tsv10_y,
                       tsv11_y, tsv12_y, tsv13_y, tsv14_y, tsv15_y, tsv16_y, tsv17_y, tsv18_y, tsv19_y, tsv20_y,
                       tsv21_y, tsv22_y, tsv23_y, tsv24_y, tsv25_y, tsv26_y, tsv27_y, tsv28_y, tsv29_y, tsv30_y,
                       tsv31_y, tsv32_y];

    foreach (i, csva, csvb, tsv, tsv_x, tsv_y; lockstep(csvSet1a, csvSet1b, tsvSet1, tsvSet1_x, tsvSet1_y))
    {
        import std.conv : to;

        /* Byte streams for csv2tsv. Consumed by csv2tsv, so need to be reset when re-used. */
        ubyte[] csvInputA = cast(ubyte[])csva;
        ubyte[] csvInputB = cast(ubyte[])csvb;

        /* CSV Set A vs TSV expected. */
        auto tsvResultA = appender!(char[])();
        csv2tsv(csvInputA, tsvResultA, "csvInputA_defaultTSV", i);
        assert(tsv == tsvResultA.data,
               format("Unittest failure. tsv != tsvResultA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csva, tsv, tsvResultA.data));

        /* CSV Set B vs TSV expected. Different CSV delimiters, same TSV results as CSV Set A. */
        auto tsvResultB = appender!(char[])();
        csv2tsv(csvInputB, tsvResultB, "csvInputB_defaultTSV", i, '#', '^');
        assert(tsv == tsvResultB.data,
               format("Unittest failure. tsv != tsvResultB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csvb, tsv, tsvResultB.data));

        /* CSV Set A and TSV with $ separator. */
        csvInputA = cast(ubyte[])csva;
        auto tsvResult_XA = appender!(char[])();
        csv2tsv(csvInputA, tsvResult_XA, "csvInputA_TSV_WithDollarDelimiter", i, '"', ',', '$');
        assert(tsv_x == tsvResult_XA.data,
               format("Unittest failure. tsv_x != tsvResult_XA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csva, tsv_x, tsvResult_XA.data));

        /* CSV Set B and TSV with $ separator. Same TSV results as CSV Set A. */
        csvInputB = cast(ubyte[])csvb;
        auto tsvResult_XB = appender!(char[])();
        csv2tsv(csvInputB, tsvResult_XB, "csvInputB__TSV_WithDollarDelimiter", i, '#', '^', '$');
        assert(tsv_x == tsvResult_XB.data,
               format("Unittest failure. tsv_x != tsvResult_XB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csvb, tsv_x, tsvResult_XB.data));

        /* CSV Set A and TSV with $ separator and tsv delimiter/newline replacement. */
        csvInputA = cast(ubyte[])csva;
        auto tsvResult_YA = appender!(char[])();
        csv2tsv(csvInputA, tsvResult_YA, "csvInputA_TSV_WithDollarAndDelimReplacement", i, '"', ',', '$', "|--|");
        assert(tsv_y == tsvResult_YA.data,
               format("Unittest failure. tsv_y != tsvResult_YA.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csva, tsv_y, tsvResult_YA.data));

        /* CSV Set B and TSV with $ separator and tsv delimiter/newline replacement. Same TSV results as CSV Set A. */
        csvInputB = cast(ubyte[])csvb;
        auto tsvResult_YB = appender!(char[])();
        csv2tsv(csvInputB, tsvResult_YB, "csvInputB__TSV_WithDollarAndDelimReplacement", i, '#', '^', '$', "|--|");
        assert(tsv_y == tsvResult_YB.data,
               format("Unittest failure. tsv_y != tsvResult_YB.data.  Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, csvb, tsv_y, tsvResult_YB.data));

    }
}
685 
unittest
{
    /* Tests of the 'maxRecords' argument of csv2tsv.
     *
     * Every case is run twice from a fresh input stream: once limited to one record
     * and once limited to two records. After each limited call a follow-up call
     * without a record limit is made on the same stream, checking what is converted
     * from the input left over by the limited call.
     */

    /* A single test case: the CSV input plus the TSV expected from each call. */
    static struct MaxRecordsCase
    {
        string csv;        /* CSV input. */
        string max1;       /* Expected TSV with max 1 record. */
        string max1Rest;   /* Expected TSV from the follow-up after the max 1 call. */
        string max2;       /* Expected TSV with max 2 records. */
        string max2Rest;   /* Expected TSV from the follow-up after the max 2 call. */
    }

    MaxRecordsCase[] cases =
        [
            /* Empty input and a lone delimiter. */
            MaxRecordsCase("",  "",     "", "",     ""),
            MaxRecordsCase(",", "\t\n", "", "\t\n", ""),

            /* Single-field records, with and without a trailing newline. */
            MaxRecordsCase("a",         "a\n", "",       "a\n",    ""),
            MaxRecordsCase("a\n",       "a\n", "",       "a\n",    ""),
            MaxRecordsCase("a\nb",      "a\n", "b\n",    "a\nb\n", ""),
            MaxRecordsCase("a\nb\n",    "a\n", "b\n",    "a\nb\n", ""),
            MaxRecordsCase("a\nb\nc",   "a\n", "b\nc\n", "a\nb\n", "c\n"),
            MaxRecordsCase("a\nb\nc\n", "a\n", "b\nc\n", "a\nb\n", "c\n"),

            /* Two-field records, unquoted. */
            MaxRecordsCase("a,aa",               "a\taa\n", "",               "a\taa\n",        ""),
            MaxRecordsCase("a,aa\n",             "a\taa\n", "",               "a\taa\n",        ""),
            MaxRecordsCase("a,aa\nb,bb",         "a\taa\n", "b\tbb\n",        "a\taa\nb\tbb\n", ""),
            MaxRecordsCase("a,aa\nb,bb\n",       "a\taa\n", "b\tbb\n",        "a\taa\nb\tbb\n", ""),
            MaxRecordsCase("a,aa\nb,bb\nc,cc",   "a\taa\n", "b\tbb\nc\tcc\n", "a\taa\nb\tbb\n", "c\tcc\n"),
            MaxRecordsCase("a,aa\nb,bb\nc,cc\n", "a\taa\n", "b\tbb\nc\tcc\n", "a\taa\nb\tbb\n", "c\tcc\n"),

            /* Two-field records, every field quoted. */
            MaxRecordsCase("\"a\",\"aa\"",
                           "a\taa\n", "",               "a\taa\n",        ""),
            MaxRecordsCase("\"a\",\"aa\"\n",
                           "a\taa\n", "",               "a\taa\n",        ""),
            MaxRecordsCase("\"a\",\"aa\"\n\"b\",\"bb\"",
                           "a\taa\n", "b\tbb\n",        "a\taa\nb\tbb\n", ""),
            MaxRecordsCase("\"a\",\"aa\"\n\"b\",\"bb\"\n",
                           "a\taa\n", "b\tbb\n",        "a\taa\nb\tbb\n", ""),
            MaxRecordsCase("\"a\",\"aa\"\n\"b\",\"bb\"\n\"c\",\"cc\"",
                           "a\taa\n", "b\tbb\nc\tcc\n", "a\taa\nb\tbb\n", "c\tcc\n"),
            MaxRecordsCase("\"a\",\"aa\"\n\"b\",\"bb\"\n\"c\",\"cc\"\n",
                           "a\taa\n", "b\tbb\nc\tcc\n", "a\taa\nb\tbb\n", "c\tcc\n"),
        ];

    foreach (i, testCase; cases)
    {
        /* csv2tsv consumes the byte stream it is given, so the stream variable is
         * re-pointed at the start of the CSV text before each limited call.
         */
        ubyte[] stream = cast(ubyte[])testCase.csv;

        /* Limit of one record. */
        auto limitOneOutput = appender!(char[])();
        csv2tsv(stream, limitOneOutput, "maxRecords-one", i, '"', ',', '\t', " ", NullableSizeT(1));
        assert(testCase.max1 == limitOneOutput.data,
               format("Unittest failure. tsv_max1 != tsvMax1Result.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, testCase.csv, testCase.max1, limitOneOutput.data));

        /* Whatever the one-record call left in the stream. */
        auto limitOneRestOutput = appender!(char[])();
        csv2tsv(stream, limitOneRestOutput, "maxRecords-one-followup", i);
        assert(testCase.max1Rest == limitOneRestOutput.data,
               format("Unittest failure. tsv_max1_rest != tsvMax1RestResult.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, testCase.csv, testCase.max1Rest, limitOneRestOutput.data));

        /* Start over from the beginning of the input for the two-record pass. */
        stream = cast(ubyte[])testCase.csv;

        /* Limit of two records. */
        auto limitTwoOutput = appender!(char[])();
        csv2tsv(stream, limitTwoOutput, "maxRecords-two", i, '"', ',', '\t', " ", NullableSizeT(2));
        assert(testCase.max2 == limitTwoOutput.data,
               format("Unittest failure. tsv_max2 != tsvMax2Result.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, testCase.csv, testCase.max2, limitTwoOutput.data));

        /* Whatever the two-record call left in the stream. */
        auto limitTwoRestOutput = appender!(char[])();
        csv2tsv(stream, limitTwoRestOutput, "maxRecords-two-followup", i);
        assert(testCase.max2Rest == limitTwoRestOutput.data,
               format("Unittest failure. tsv_max2_rest != tsvMax2RestResult.data. Test: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
                      i + 1, testCase.csv, testCase.max2Rest, limitTwoRestOutput.data));
    }
}