1 /**
2 Command line tool that appends multiple TSV files. It is header aware and supports
3 tracking the original source file of each row.
4 
5 Copyright (c) 2017-2020, eBay Inc.
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_utils.tsv_append;
11 
12 import std.conv : to;
13 import std.exception : enforce;
14 import std.range;
15 import std.stdio;
16 import std.typecons : tuple;
17 
/* Embeds a default D runtime option in the executable: "gcopt=cleanup:none" asks the
 * garbage collector to skip its cleanup pass at program termination. The declaration
 * is only compiled for compiler front-end versions 2.085 and later.
 * NOTE(review): presumably 2.085 is the release where the GC 'cleanup' option became
 * available - confirm against the D changelog.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
19 
20 version(unittest)
21 {
22     // When running unit tests, use main from -main compiler switch.
23 }
24 else
25 {
26     /** Main program. Invokes command line arg processing and tsv-append to perform
27      * the real work. Any errors are caught and reported.
28      */
29     int main(string[] cmdArgs)
30     {
31         import tsv_utils.common.utils : BufferedOutputRange;
32         /* When running in DMD code coverage mode, turn on report merging. */
33         version(D_Coverage) version(DigitalMars)
34         {
35             import core.runtime : dmd_coverSetMerge;
36             dmd_coverSetMerge(true);
37         }
38 
39         TsvAppendOptions cmdopt;
40         auto r = cmdopt.processArgs(cmdArgs);
41         if (!r[0]) return r[1];
42         try tsvAppend(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout));
43         catch (Exception exc)
44         {
45             stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
46             return 1;
47         }
48         return 0;
49     }
50 }
51 
/** Full help text, printed ahead of the option list when '--help-verbose' is given.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, similar to the Unix 'cat' utility.
Unlike 'cat', it is header aware ('--H|header'), writing the header from only
the first file. It also supports source tracking, adding a column indicating
the original file to each row. Results are written to standard output.

Concatenation with header support is useful when preparing data for traditional
Unix utilities like 'sort' and 'sed' or applications that read a single file.

Source tracking is useful when creating long/narrow form tabular data, a format
used by many statistics and data mining packages. In this scenario, files have
been used to capture related data sets, the difference between data sets being a
condition represented by the file. For example, results from different variants
of an experiment might each be recorded in their own files. Retaining the source
file as an output column preserves the condition represented by the file.

The file-name (without extension) is used as the source value. This can be
customized using the --f|file option.

Example: Header processing:

   $ tsv-append -H file1.tsv file2.tsv file3.tsv

Example: Header processing and source tracking:

   $ tsv-append -H -t file1.tsv file2.tsv file3.tsv

Example: Source tracking with custom values:

   $ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv

Options:
EOS";
87 
/** Short help text, printed ahead of the option list when help is requested.
 */
string helpText = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, reading from files or standard input
and writing to standard output. It is header aware ('--H|header'), writing the
header from only the first file. It also supports source tracking, adding an
indicator of original file to each row of input.

Options:
EOS";
98 
/** Container for command line options.
 *
 * The fields are filled in by processArgs and read by tsvAppend.
 */
struct TsvAppendOptions
{
    string programName;                /// Derived from cmdArgs[0] (extension and directory removed)
    string[] files;                    /// Input files
    string[string] fileSourceNames;    /// Maps file path to the 'source' value
    string sourceHeader;               /// --s|source-header
    bool trackSource = false;          /// --t|track-source
    bool hasHeader = false;            /// --H|header
    char delim = '\t';                 /// --d|delimiter

    /* fileOptionHandler processes the '--f|file source=file' option.
     *
     * The option value is split at the first '='. The part after it is appended to
     * 'files', and the part before it is recorded as that file's source value.
     * Throws (via enforce) if either part is empty, which includes the case where
     * the value has no '=' at all.
     */
    private void fileOptionHandler(string option, string optionVal) pure @safe
    {
        import std.algorithm : findSplit;
        import std.format : format;

        auto valSplit = findSplit(optionVal, "=");

        enforce(!valSplit[0].empty && !valSplit[2].empty,
                format("Invalid option value: '--%s %s'. Expected: '--%s <source>=<file>'.",
                       option, optionVal, option));

        auto source = valSplit[0];
        auto filepath = valSplit[2];
        files ~= filepath;
        fileSourceNames[filepath] = source;
    }

    /** Command line argument processing.
     *
     * Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help or version information. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means derived values have been calculated:
     * $(UL
     *   $(LI trackSource is turned on if --f|file or --s|source-header was used.)
     *   $(LI hasHeader is turned on if --s|source-header was used.)
     *   $(LI sourceHeader defaults to "file" when headers are enabled.)
     *   $(LI Arguments left over after option parsing are appended to 'files', after
     *        any --f|file entries, with the file name minus directory and extension
     *        as the source value.)
     *   $(LI fileSourceNames has an entry for "-".)
     * )
     *
     * Errors raised during option parsing are caught here and reported on stderr.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        bool helpVerbose = false;          // --help-verbose
        bool versionWanted = false;        // --V|version

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "          Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",        "          Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "t|track-source",  "          Track the source file. Adds a column with the source name.", &trackSource,
                "s|source-header", "STR       Use STR as the header for the source column. Implies --H|header and --t|track-source. Default: 'file'", &sourceHeader,
                "f|file",          "STR=FILE  Read file FILE, using STR as the 'source' value. Implies --t|track-source.", &fileOptionHandler,
                "d|delimiter",     "CHR       Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version",       "          Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-append"));
                return tuple(false, 0);
            }

            /* Derivations and consistency checks. At this point 'files' holds only
             * entries added by the --f|file option.
             */
            if (files.length > 0 || !sourceHeader.empty) trackSource = true;
            if (!sourceHeader.empty) hasHeader = true;
            if (hasHeader && sourceHeader.empty) sourceHeader = "file";

            /* Assume the remaining arguments are filepaths.
             *
             * NOTE(review): a positional "-" is given the source name "-" by this
             * loop, so the "stdin" mapping added below only takes effect when "-"
             * was not listed explicitly - confirm this is the intended behavior.
             */
            foreach (fp; cmdArgs[1 .. $])
            {
                files ~= fp;
                fileSourceNames[fp] = fp.stripExtension.baseName;
            }

            /* Add a name mapping for dash ('-') unless it was included in the --file option. */
            if ("-" !in fileSourceNames) fileSourceNames["-"] = "stdin";
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
209 
/** tsvAppend concatenates the input files onto outputStream.
 *
 * The files listed in cmdopt.files are read in order; if none are listed, standard
 * input ("-") is read. When cmdopt.hasHeader is set, the first line of each file is
 * treated as a header and only the first one encountered is written. When
 * cmdopt.trackSource is set, each output line is prefixed with a source value
 * (cmdopt.sourceHeader on the header line, the file's entry in
 * cmdopt.fileSourceNames on other lines) followed by cmdopt.delim.
 */
void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange;

    /* With no files listed, read standard input. */
    auto inputPaths = (cmdopt.files.length > 0) ? cmdopt.files : ["-"];
    bool headerEmitted = false;

    foreach (path; inputPaths)
    {
        auto input = (path == "-") ? stdin : path.File();
        auto sourceLabel = cmdopt.fileSourceNames[path];

        foreach (lineNum, line; input.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            immutable bool isHeaderLine = cmdopt.hasHeader && lineNum == 1;

            /* Only the first header line seen is written; later ones are dropped. */
            if (isHeaderLine && headerEmitted) continue;

            if (cmdopt.trackSource)
            {
                outputStream.put(isHeaderLine ? cmdopt.sourceHeader : sourceLabel);
                outputStream.put(cmdopt.delim);
            }
            outputStream.put(line);
            outputStream.put('\n');

            if (isHeaderLine)
            {
                headerEmitted = true;

                /* Flush the header immediately. This helps tasks further on in a
                 * unix pipeline detect errors quickly, without waiting for all
                 * the data to flow through the pipeline. Note that an upstream
                 * task may have flushed its header line, so the header may
                 * arrive long before the main block of data.
                 */
                static if (isFlushableOutputRange!OutputRange) outputStream.flush;
            }
        }
    }
}
259 
version(unittest)
{
    /* Helpers used by the unit tests. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.

    /** Runs tsvAppend for one command line and asserts on the output.
     *
     * cmdArgs[0] is used as the test label in failure messages; the rest are
     * handed to TsvAppendOptions.processArgs as the command line. 'expected' holds
     * the expected output rows and is converted to text with tsvDataToString
     * before being compared with what tsvAppend wrote.
     */
    void testTsvAppend(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvAppend] cmdArgs must not be empty.");

        /* Prefixes a failure message with the helper name and the test label. */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            return format("[testTsvAppend] %s: " ~ msg, cmdArgs[0], formatArgs);
        }

        /* processArgs takes cmdArgs by ref, so capture a printable copy first. */
        string argsForMessage = cmdArgs.to!string;

        TsvAppendOptions options;
        auto argsResult = options.processArgs(cmdArgs);
        assert(argsResult[0], formatAssertMessage("Invalid command lines arg: '%s'.", argsForMessage));

        auto expectedOutput = expected.tsvDataToString;
        auto actualOutput = appender!(char[])();
        tsvAppend(options, actualOutput);

        assert(actualOutput.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, actualOutput.data.to!string));
    }
}
294 
/* End-to-end tests: write small TSV files to a temporary directory, then run
 * tsvAppend over them with different option combinations.
 */
unittest
{
    import std.file : rmdirRecurse;
    import std.path : buildPath;

    auto testDir = makeUnittestTempDir("tsv_append");
    scope(exit) testDir.rmdirRecurse;

    /* Input data. Every non-empty file starts with the same header row. */
    string[] headerRow = ["field_a", "field_b", "field_c"];

    string[][] data1 =
        [headerRow,
         ["red", "17", "κόκκινος"],
         ["blue", "12", "άσπρο"]];

    string[][] data2 =
        [headerRow,
         ["green", "13.5", "κόκκινος"],
         ["blue", "15", "πράσινος"]];

    string[][] data3 =
        [headerRow,
         ["yellow", "9", "κίτρινος"]];

    string[][] dataHeaderRowOnly = [headerRow];

    string[][] dataEmpty = [[]];

    /* Write the input files. */
    string filepath1 = buildPath(testDir, "file1.tsv");
    string filepath2 = buildPath(testDir, "file2.tsv");
    string filepath3 = buildPath(testDir, "file3.tsv");
    string filepathHeaderRowOnly = buildPath(testDir, "fileHeaderRowOnly.tsv");
    string filepathEmpty = buildPath(testDir, "fileEmpty.tsv");

    writeUnittestTsvFile(filepath1, data1);
    writeUnittestTsvFile(filepath2, data2);
    writeUnittestTsvFile(filepath3, data3);
    writeUnittestTsvFile(filepathHeaderRowOnly, dataHeaderRowOnly);
    writeUnittestTsvFile(filepathEmpty, dataEmpty);

    /* Expected results for header-aware concatenation: one header row, followed by
     * the non-header rows of each file.
     */
    string[][] files12WithHeader = data1 ~ data2[1 .. $];
    string[][] files123WithHeader = files12WithHeader ~ data3[1 .. $];

    /* Plain concatenation, with and without header handling. */
    testTsvAppend(["test-1", filepath1], data1);
    testTsvAppend(["test-2", "--header", filepath1], data1);
    testTsvAppend(["test-3", filepath1, filepath2], data1 ~ data2);
    testTsvAppend(["test-4", "--header", filepath1, filepath2], files12WithHeader);
    testTsvAppend(["test-5", "--header", filepath1, filepath2, filepath3], files123WithHeader);

    /* Empty and header-only files mixed in with the others. */
    testTsvAppend(["test-6", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3);

    testTsvAppend(["test-7", "--header", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  files123WithHeader);

    /* Source tracking using the default source values (file name without extension). */
    testTsvAppend(["test-8", "--track-source", filepath1, filepath2],
                  [["file1", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "field_a", "field_b", "field_c"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    testTsvAppend(["test-9", "--header", "--track-source", filepath1, filepath2],
                  [["file", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    testTsvAppend(["test-10", "-H", "-t", "--source-header", "source",
                   filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["source", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    /* Source tracking with custom source values given through --f|file. */
    testTsvAppend(["test-11", "-H", "-t", "-s", "id", "--file", "1a=" ~ filepath1,
                   "--file", "1b=" ~ filepath2, "--file", "1c=" ~ filepath3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["1c", "yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-12", "-s", "id", "-f", "1a=" ~ filepath1,
                   "-f", "1b=" ~ filepath2, filepath3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);
}