/**
Command line tool that appends multiple TSV files. It is header aware and supports
tracking the original source file of each row.

Copyright (c) 2017-2021, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_append;

import std.conv : to;
import std.exception : enforce;
import std.range;
import std.stdio;
import std.typecons : tuple;

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program. Invokes command line arg processing and tsv-append to perform
     * the real work. Any errors are caught and reported.
     *
     * Returns: 0 on success or when help/version output was requested, 1 if argument
     * processing failed or tsvAppend threw an exception.
     */
    int main(string[] cmdArgs)
    {
        import tsv_utils.common.utils : BufferedOutputRange, LineBuffered;

        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvAppendOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];

        immutable LineBuffered linebuffered = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered;

        try tsvAppend(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout, linebuffered));
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

auto helpTextVerbose = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, similar to the Unix 'cat' utility.
Unlike 'cat', it is header aware ('--H|header'), writing the header from only
the first file. It also supports source tracking, adding a column indicating
the original file to each row. Results are written to standard output.

Concatenation with header support is useful when preparing data for traditional
Unix utilities like 'sort' and 'sed' or applications that read a single file.

Source tracking is useful when creating long/narrow form tabular data, a format
used by many statistics and data mining packages. In this scenario, files have
been used to capture related data sets, the difference between data sets being a
condition represented by the file. For example, results from different variants
of an experiment might each be recorded in their own files. Retaining the source
file as an output column preserves the condition represented by the file.

The file-name (without extension) is used as the source value. This can be
customized using the --f|file option.

Example: Header processing:

   $ tsv-append -H file1.tsv file2.tsv file3.tsv

Example: Header processing and source tracking:

   $ tsv-append -H -t file1.tsv file2.tsv file3.tsv

Example: Source tracking with custom values:

   $ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv

Options:
EOS";

auto helpText = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, reading from files or standard input
and writing to standard output. It is header aware ('--H|header'), writing the
header from only the first file. It also supports source tracking, adding an
indicator of original file to each row of input.

Options:
EOS";

/** Container for command line options.
 */
struct TsvAppendOptions
{
    string programName;
    string[] files;                    /// Input files
    string[string] fileSourceNames;    /// Maps file path to the 'source' value
    string sourceHeader;               /// --s|source-header
    bool trackSource = false;          /// --t|track-source
    bool hasHeader = false;            /// --H|header
    char delim = '\t';                 /// --d|delimiter
    bool lineBuffered = false;         /// --line-buffered

    /* fileOptionHandler processes the '--f|file source=file' option.
     *
     * The option value is split at the first '='. The text before it is the source
     * value, the text after it is the file path. The path is appended to 'files' and
     * the mapping is recorded in 'fileSourceNames'. An exception is thrown (via
     * enforce) if either side of the '=' is empty or there is no '='.
     */
    private void fileOptionHandler(string option, string optionVal) pure @safe
    {
        import std.algorithm : findSplit;
        import std.format : format;

        auto valSplit = findSplit(optionVal, "=");

        enforce(!valSplit[0].empty && !valSplit[2].empty,
                format("Invalid option value: '--%s %s'. Expected: '--%s <source>=<file>'.",
                       option, optionVal, option));

        auto source = valSplit[0];
        auto filepath = valSplit[2];
        files ~= filepath;
        fileSourceNames[filepath] = source;
    }

    /** Command line argument processing.
     *
     * Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated: --s|source-header and --f|file turn on source tracking,
     * --s|source-header turns on header processing, the source header defaults to
     * 'file' when header processing is on, and every input file (plus '-' for standard
     * input) has a 'source' value recorded in fileSourceNames.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        bool helpVerbose = false;          // --help-verbose
        bool versionWanted = false;        // --V|version

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "          Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",        "          Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "t|track-source",  "          Track the source file. Adds a column with the source name.", &trackSource,
                "s|source-header", "STR       Use STR as the header for the source column. Implies --H|header and --t|track-source. Default: 'file'", &sourceHeader,
                "f|file",          "STR=FILE  Read file FILE, using STR as the 'source' value. Implies --t|track-source.", &fileOptionHandler,
                "d|delimiter",     "CHR       Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                "line-buffered",   "          Immediately output every line.", &lineBuffered,
                std.getopt.config.caseSensitive,
                "V|version",       "          Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-append"));
                return tuple(false, 0);
            }

            /* Derivations and consistency checks. At this point 'files' holds only the
             * entries added by --f|file; positional file arguments are added below.
             */
            if (files.length > 0 || !sourceHeader.empty) trackSource = true;
            if (!sourceHeader.empty) hasHeader = true;
            if (hasHeader && sourceHeader.empty) sourceHeader = "file";

            /* Assume the remaining arguments are filepaths. The source value is the
             * file name with directory and extension removed.
             */
            foreach (fp; cmdArgs[1 .. $])
            {
                files ~= fp;
                fileSourceNames[fp] = fp.stripExtension.baseName;
            }

            /* Add a name mapping for dash ('-') unless it was included in the --file option. */
            if ("-" !in fileSourceNames) fileSourceNames["-"] = "stdin";
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/** tsvAppend implements the basic functionality of the tsv-append program.
 *
 * Each file in cmdopt.files (standard input, "-", if the list is empty) is read line
 * by line and written to outputStream. When header processing is on, the first line
 * of each file is treated as a header and only the first header encountered is
 * written. When source tracking is on, each data line is prefixed with the file's
 * source value and the delimiter; the header line is prefixed with the source header.
 *
 * Params:
 *   cmdopt       = Options, as populated by TsvAppendOptions.processArgs.
 *   outputStream = Character output range receiving the result.
 */
void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, LineBuffered,
        ReadHeader;

    immutable LineBuffered isLineBuffered = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered;
    immutable ReadHeader useReadHeader = cmdopt.hasHeader ? Yes.readHeader : No.readHeader;

    bool headerWritten = false;
    foreach (filename; (cmdopt.files.length > 0) ? cmdopt.files : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        auto sourceName = cmdopt.fileSourceNames[filename];
        foreach (fileLineNum, line;
                 inputStream
                 .bufferedByLine!(KeepTerminator.no)(isLineBuffered, useReadHeader)
                 .enumerate(1))
        {
            if (cmdopt.hasHeader && fileLineNum == 1)
            {
                if (!headerWritten)
                {
                    if (cmdopt.trackSource)
                    {
                        outputStream.put(cmdopt.sourceHeader);
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put('\n');
                    headerWritten = true;

                    /* Flush the header immediately. This helps tasks further on in a
                     * unix pipeline detect errors quickly, without waiting for all
                     * the data to flow through the pipeline. Note that an upstream
                     * task may have flushed its header line, so the header may
                     * arrive long before the main block of data.
                     */
                    static if (isFlushableOutputRange!OutputRange) outputStream.flush;
                }
            }
            else
            {
                if (cmdopt.trackSource)
                {
                    outputStream.put(sourceName);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put('\n');
            }
        }
        /* Files don't always close quickly enough on their own. */
        if (filename != "-") inputStream.close;
    }
}

version(unittest)
{
    /* Unit test helper functions. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.

    /* Runs processArgs and tsvAppend on cmdArgs, capturing the output in memory, and
     * asserts that it equals 'expected' rendered as TSV text. cmdArgs[0] is the test
     * name; it is used in the assertion messages.
     */
    void testTsvAppend(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvAppend] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testTsvAppend] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvAppendOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;    // processArgs modifies cmdArgs
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        auto output = appender!(char[])();
        tsvAppend(cmdopt, output);
        auto expectedOutput = expected.tsvDataToString;

        assert(output.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, output.data.to!string));
    }
}

unittest
{
    import std.path : buildPath;
    import std.file : rmdirRecurse;
    import std.format : format;

    auto testDir = makeUnittestTempDir("tsv_append");
    scope(exit) testDir.rmdirRecurse;

    string[][] data1 =
        [["field_a", "field_b", "field_c"],
         ["red", "17", "κόκκινος"],
         ["blue", "12", "άσπρο"]];

    string[][] data2 =
        [["field_a", "field_b", "field_c"],
         ["green", "13.5", "κόκκινος"],
         ["blue", "15", "πράσινος"]];

    string[][] data3 =
        [["field_a", "field_b", "field_c"],
         ["yellow", "9", "κίτρινος"]];

    string[][] dataHeaderRowOnly =
        [["field_a", "field_b", "field_c"]];

    string[][] dataEmpty = [[]];

    string filepath1 = buildPath(testDir, "file1.tsv");
    string filepath2 = buildPath(testDir, "file2.tsv");
    string filepath3 = buildPath(testDir, "file3.tsv");
    string filepathHeaderRowOnly = buildPath(testDir, "fileHeaderRowOnly.tsv");
    string filepathEmpty = buildPath(testDir, "fileEmpty.tsv");

    writeUnittestTsvFile(filepath1, data1);
    writeUnittestTsvFile(filepath2, data2);
    writeUnittestTsvFile(filepath3, data3);
    writeUnittestTsvFile(filepathHeaderRowOnly, dataHeaderRowOnly);
    writeUnittestTsvFile(filepathEmpty, dataEmpty);

    testTsvAppend(["test-1", filepath1], data1);
    testTsvAppend(["test-2", "--header", filepath1], data1);
    testTsvAppend(["test-3", filepath1, filepath2], data1 ~ data2);

    testTsvAppend(["test-4", "--header", filepath1, filepath2],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"]]);

    testTsvAppend(["test-5", "--header", filepath1, filepath2, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-6", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3);

    testTsvAppend(["test-7", "--header", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-8", "--track-source", filepath1, filepath2],
                  [["file1", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "field_a", "field_b", "field_c"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    testTsvAppend(["test-9", "--header", "--track-source", filepath1, filepath2],
                  [["file", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    testTsvAppend(["test-10", "-H", "-t", "--source-header", "source",
                   filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["source", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-11", "-H", "-t", "-s", "id", "--file", format("1a=%s", filepath1),
                   "--file", format("1b=%s", filepath2), "--file", format("1c=%s", filepath3)],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["1c", "yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-12", "-s", "id", "-f", format("1a=%s", filepath1),
                   "-f", format("1b=%s", filepath2), filepath3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-13", "--line-buffered", filepath1], data1);
    testTsvAppend(["test-14", "--line-buffered", "--header", filepath1], data1);
    testTsvAppend(["test-15", "--line-buffered", filepath1, filepath2], data1 ~ data2);
    testTsvAppend(["test-16", "--line-buffered", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3);

    testTsvAppend(["test-17", "--line-buffered", "-s", "id", "-f", format("1a=%s", filepath1),
                   "-f", format("1b=%s", filepath2), filepath3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);
}