1 /**
2 Command line tool that appends multiple TSV files. It is header aware and supports
3 tracking the original source file of each row.
4 
5 Copyright (c) 2017-2019, eBay Software Foundation
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_utils.tsv_append;
11 
12 import std.conv : to;
13 import std.range;
14 import std.stdio;
15 import std.typecons : tuple;
16 
/* Embeds a D runtime option in the executable: "gcopt=cleanup:none" configures how
 * the garbage collector treats live objects at program termination ('none' requests
 * no cleanup pass). The declaration is only compiled when the compiler front-end
 * version (__VERSION__) is 2085 or later.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
18 
19 version(unittest)
20 {
21     // When running unit tests, use main from -main compiler switch.
22 }
23 else
24 {
25     /** Main program. Invokes command line arg processing and tsv-append to perform
26      * the real work. Any errors are caught and reported.
27      */
28     int main(string[] cmdArgs)
29     {
30         import tsv_utils.common.utils : BufferedOutputRange;
31         /* When running in DMD code coverage mode, turn on report merging. */
32         version(D_Coverage) version(DigitalMars)
33         {
34             import core.runtime : dmd_coverSetMerge;
35             dmd_coverSetMerge(true);
36         }
37 
38         TsvAppendOptions cmdopt;
39         auto r = cmdopt.processArgs(cmdArgs);
40         if (!r[0]) return r[1];
41         try tsvAppend(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout));
42         catch (Exception exc)
43         {
44             stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
45             return 1;
46         }
47         return 0;
48     }
49 }
50 
/** Full help text. Printed ahead of the option list when '--help-verbose' is given.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, similar to the Unix 'cat' utility.
Unlike 'cat', it is header aware ('--H|header'), writing the header from only
the first file. It also supports source tracking, adding a column indicating
the original file to each row. Results are written to standard output.

Concatenation with header support is useful when preparing data for traditional
Unix utilities like 'sort' and 'sed' or applications that read a single file.

Source tracking is useful when creating long/narrow form tabular data, a format
used by many statistics and data mining packages. In this scenario, files have
been used to capture related data sets, the difference between data sets being a
condition represented by the file. For example, results from different variants
of an experiment might each be recorded in their own files. Retaining the source
file as an output column preserves the condition represented by the file.

The file-name (without extension) is used as the source value. This can be
customized using the --f|file option.

Example: Header processing:

   $ tsv-append -H file1.tsv file2.tsv file3.tsv

Example: Header processing and source tracking:

   $ tsv-append -H -t file1.tsv file2.tsv file3.tsv

Example: Source tracking with custom values:

   $ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv

Options:
EOS";
86 
/** Brief help text. Printed ahead of the option list when getopt reports that help
 * was requested.
 */
auto helpText = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, reading from files or standard input
and writing to standard output. It is header aware ('--H|header'), writing the
header from only the first file. It also supports source tracking, adding an
indicator of original file to each row of input.

Options:
EOS";
97 
/** Container for the tsv-append command line options.
 *
 * processArgs parses the command line, applies the settings implied by other
 * options, and builds both the input file list and the file-to-source-name map.
 */
struct TsvAppendOptions
{
    string programName;                // Derived from cmdArgs[0] (extension and directory removed)
    string[] files;                    // Input files
    string[string] fileSourceNames;    // Maps file path to the 'source' value
    bool helpVerbose = false;          // --help-verbose
    string sourceHeader;               // --s|source-header
    bool trackSource = false;          // --t|track-source
    bool hasHeader = false;            // --H|header
    char delim = '\t';                 // --d|delimiter
    bool versionWanted = false;        // --V|version

    /* fileOptionHandler processes the '--f|file source=file' option.
     *
     * The value is split at the first '='. The text before it is the source value,
     * the text after it is the file path. The file is appended to 'files' and its
     * source value recorded in 'fileSourceNames'.
     *
     * Throws: Exception if the '=' is missing or if either side of it is empty.
     */
    private void fileOptionHandler(string option, string optionVal)
    {
        import std.algorithm : findSplit;
        import std.format : format;

        auto valSplit = findSplit(optionVal, "=");
        if (valSplit[0].empty || valSplit[2].empty)
            throw new Exception(
                format("Invalid option value: '--%s %s'. Expected: '--%s <source>=<file>'.",
                       option, optionVal, option));

        auto source = valSplit[0];
        auto filepath = valSplit[2];
        files ~= filepath;
        fileSourceNames[filepath] = source;
    }

    /** Command line argument processing.
     *
     * Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help or version information. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means implied options have been applied and
     * the input file list has been assembled:
     * - '--f|file' or '--s|source-header' turns on source tracking.
     * - '--s|source-header' turns on header processing.
     * - In header mode the source header defaults to "file".
     * - Files given via '--f|file' come first in 'files', followed by the remaining
     *   command line arguments, whose source value is the file name without directory
     *   or extension.
     * - 'fileSourceNames' always has an entry for "-"; it is "stdin" unless "-" was
     *   already mapped.
     *
     * Errors raised during option parsing are reported on stderr rather than propagated.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "          Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",        "          Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "t|track-source",  "          Track the source file. Adds a column with the source name.", &trackSource,
                "s|source-header", "STR       Use STR as the header for the source column. Implies --H|header and --t|track-source. Default: 'file'", &sourceHeader,
                "f|file",          "STR=FILE  Read file FILE, using STR as the 'source' value. Implies --t|track-source.", &fileOptionHandler,
                "d|delimiter",     "CHR       Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version",       "          Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-append"));
                return tuple(false, 0);
            }

            /* Derivations and consistency checks. At this point 'files' holds only the
             * files given via '--f|file', so a non-empty list means that option was used.
             */
            if (files.length > 0 || !sourceHeader.empty) trackSource = true;
            if (!sourceHeader.empty) hasHeader = true;
            if (hasHeader && sourceHeader.empty) sourceHeader = "file";

            /* Assume the remaining arguments are filepaths. */
            foreach (fp; cmdArgs[1 .. $])
            {
                files ~= fp;
                fileSourceNames[fp] = fp.stripExtension.baseName;
            }

            /* Add a name mapping for dash ('-') unless one already exists. */
            if ("-" !in fileSourceNames) fileSourceNames["-"] = "stdin";
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
207 
/** Concatenates the input files named in cmdopt and writes the result to outputStream.
 *
 * Standard input is read when no files were given, and for any file named "-". In
 * header mode the first line of every file is treated as a header and only the first
 * header encountered is written. With source tracking on, each output line starts
 * with a source column: the source header on the header line, and the file's entry
 * in cmdopt.fileSourceNames on all other lines.
 */
void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import tsv_utils.common.utils : bufferedByLine;

    /* Writes one output line, preceded by the source column when tracking is on. */
    void writeRow(Line)(string sourceField, Line content)
    {
        if (cmdopt.trackSource)
        {
            outputStream.put(sourceField);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(content);
        outputStream.put('\n');
    }

    auto inputPaths = (cmdopt.files.length == 0) ? ["-"] : cmdopt.files;
    bool headerEmitted = false;

    foreach (path; inputPaths)
    {
        auto sourceName = cmdopt.fileSourceNames[path];
        File input = (path == "-") ? stdin : File(path);

        foreach (lineNum, line; input.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            immutable bool isHeaderLine = cmdopt.hasHeader && lineNum == 1;

            if (!isHeaderLine)
            {
                writeRow(sourceName, line);
                continue;
            }

            /* Header line: only the first one seen across all files is written. */
            if (headerEmitted) continue;

            writeRow(cmdopt.sourceHeader, line);
            headerEmitted = true;
        }
    }
}
249 
250 version(unittest)
251 {
252     /* Unit test helper functions. */
253 
254     import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
255 
256     void testTsvAppend(string[] cmdArgs, string[][] expected)
257     {
258         import std.array : appender;
259         import std.format : format;
260 
261         assert(cmdArgs.length > 0, "[testTsvAppend] cmdArgs must not be empty.");
262 
263         auto formatAssertMessage(T...)(string msg, T formatArgs)
264         {
265             auto formatString = "[testTsvAppend] %s: " ~ msg;
266             return format(formatString, cmdArgs[0], formatArgs);
267         }
268 
269         TsvAppendOptions cmdopt;
270         auto savedCmdArgs = cmdArgs.to!string;
271         auto r = cmdopt.processArgs(cmdArgs);
272         assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
273 
274         auto output = appender!(char[])();
275         tsvAppend(cmdopt, output);
276         auto expectedOutput = expected.tsvDataToString;
277 
278         assert(output.data == expectedOutput,
279                formatAssertMessage(
280                    "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
281                    expectedOutput.to!string, output.data.to!string));
282     }
283  }
284 
/* End-to-end tests of tsvAppend. Small TSV files are written to a temporary
 * directory, then the tool is run with different option combinations and the
 * output compared to the expected rows.
 */
unittest
{
    import std.path : buildPath;
    import std.file : rmdirRecurse;
    import std.format : format;

    auto tmpDir = makeUnittestTempDir("tsv_append");
    scope(exit) tmpDir.rmdirRecurse;

    /* Prepends a source column value to every row. */
    string[][] withSource(string source, string[][] rows)
    {
        string[][] tagged;
        foreach (row; rows) tagged ~= [source] ~ row;
        return tagged;
    }

    /* Every non-empty fixture shares the same header row. */
    string[] headerRow = ["field_a", "field_b", "field_c"];

    string[][] rows1 =
        [["red", "17", "κόκκινος"],
         ["blue", "12", "άσπρο"]];

    string[][] rows2 =
        [["green", "13.5", "κόκκινος"],
         ["blue", "15", "πράσινος"]];

    string[][] rows3 =
        [["yellow", "9", "κίτρινος"]];

    /* File contents: header row followed by the data rows. */
    string[][] data1 = [headerRow] ~ rows1;
    string[][] data2 = [headerRow] ~ rows2;
    string[][] data3 = [headerRow] ~ rows3;
    string[][] dataHeaderRowOnly = [headerRow];
    string[][] dataEmpty = [[]];

    string filepath1 = buildPath(tmpDir, "file1.tsv");
    string filepath2 = buildPath(tmpDir, "file2.tsv");
    string filepath3 = buildPath(tmpDir, "file3.tsv");
    string filepathHeaderRowOnly = buildPath(tmpDir, "fileHeaderRowOnly.tsv");
    string filepathEmpty = buildPath(tmpDir, "fileEmpty.tsv");

    writeUnittestTsvFile(filepath1, data1);
    writeUnittestTsvFile(filepath2, data2);
    writeUnittestTsvFile(filepath3, data3);
    writeUnittestTsvFile(filepathHeaderRowOnly, dataHeaderRowOnly);
    writeUnittestTsvFile(filepathEmpty, dataEmpty);

    /* Plain concatenation, with and without header processing. */
    testTsvAppend(["test-1", filepath1], data1);
    testTsvAppend(["test-2", "--header", filepath1], data1);
    testTsvAppend(["test-3", filepath1, filepath2], data1 ~ data2);

    testTsvAppend(["test-4", "--header", filepath1, filepath2],
                  [headerRow] ~ rows1 ~ rows2);

    testTsvAppend(["test-5", "--header", filepath1, filepath2, filepath3],
                  [headerRow] ~ rows1 ~ rows2 ~ rows3);

    /* Empty and header-only files mixed in. */
    testTsvAppend(["test-6", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3);

    testTsvAppend(["test-7", "--header", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [headerRow] ~ rows1 ~ rows2 ~ rows3);

    /* Source tracking without header processing: every line gets the file's source value. */
    testTsvAppend(["test-8", "--track-source", filepath1, filepath2],
                  withSource("file1", data1) ~ withSource("file2", data2));

    /* Source tracking with header processing, default and custom source header. */
    testTsvAppend(["test-9", "--header", "--track-source", filepath1, filepath2],
                  [["file"] ~ headerRow]
                  ~ withSource("file1", rows1)
                  ~ withSource("file2", rows2));

    testTsvAppend(["test-10", "-H", "-t", "--source-header", "source",
                   filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["source"] ~ headerRow]
                  ~ withSource("file1", rows1)
                  ~ withSource("file2", rows2)
                  ~ withSource("file3", rows3));

    /* Custom source values via --f|file. */
    testTsvAppend(["test-11", "-H", "-t", "-s", "id", "--file", format("1a=%s", filepath1),
                   "--file", format("1b=%s", filepath2), "--file", format("1c=%s", filepath3)],
                  [["id"] ~ headerRow]
                  ~ withSource("1a", rows1)
                  ~ withSource("1b", rows2)
                  ~ withSource("1c", rows3));

    /* --f|file entries mixed with a plain file argument. */
    testTsvAppend(["test-12", "-s", "id", "-f", format("1a=%s", filepath1),
                   "-f", format("1b=%s", filepath2), filepath3],
                  [["id"] ~ headerRow]
                  ~ withSource("1a", rows1)
                  ~ withSource("1b", rows2)
                  ~ withSource("file3", rows3));
}