/**
Command line tool that appends multiple TSV files. It is header aware and supports
tracking the original source file of each row.

Copyright (c) 2017-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_append;

import std.conv : to;
import std.exception : enforce;
import std.range;
import std.stdio;
import std.typecons : tuple;

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program. Invokes command line arg processing and tsv-append to perform
     * the real work. Any errors are caught and reported.
     *
     * Returns: 0 on success or when help/version output was requested; 1 if argument
     * processing or the append operation failed.
     */
    int main(string[] cmdArgs)
    {
        import tsv_utils.common.utils : BufferedOutputRange;

        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvAppendOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];

        try
        {
            tsvAppend(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout));
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

auto helpTextVerbose = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, similar to the Unix 'cat' utility.
Unlike 'cat', it is header aware ('--H|header'), writing the header from only
the first file. It also supports source tracking, adding a column indicating
the original file to each row. Results are written to standard output.

Concatenation with header support is useful when preparing data for traditional
Unix utilities like 'sort' and 'sed' or applications that read a single file.

Source tracking is useful when creating long/narrow form tabular data, a format
used by many statistics and data mining packages. In this scenario, files have
been used to capture related data sets, the difference between data sets being a
condition represented by the file. For example, results from different variants
of an experiment might each be recorded in their own files. Retaining the source
file as an output column preserves the condition represented by the file.

The file-name (without extension) is used as the source value. This can be
customized using the --f|file option.

Example: Header processing:

$ tsv-append -H file1.tsv file2.tsv file3.tsv

Example: Header processing and source tracking:

$ tsv-append -H -t file1.tsv file2.tsv file3.tsv

Example: Source tracking with custom values:

$ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv

Options:
EOS";

auto helpText = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, reading from files or standard input
and writing to standard output. It is header aware ('--H|header'), writing the
header from only the first file. It also supports source tracking, adding an
indicator of original file to each row of input.

Options:
EOS";

/** Container for command line options.
 *
 * 'files' and 'fileSources' are parallel arrays: fileSources[i] is the source value
 * written for rows read from files[i]. 'fileSourceNames' is kept as a path-keyed
 * lookup; because it is keyed by path it can hold only one source value per path
 * (the last one registered), so it cannot represent the same file listed twice
 * with different source values.
 */
struct TsvAppendOptions
{
    string programName;
    string[] files;                    // Input files
    string[] fileSources;              // Source value for each entry in 'files' (same order)
    string[string] fileSourceNames;    // Maps file path to the 'source' value
    bool helpVerbose = false;          // --help-verbose
    string sourceHeader;               // --s|source-header
    bool trackSource = false;          // --t|track-source
    bool hasHeader = false;            // --H|header
    char delim = '\t';                 // --d|delimiter
    bool versionWanted = false;        // --V|version

    /* fileOptionHandler processes the '--f|file source=file' option.
     *
     * The option value is split at the first '='. Both the source and the file
     * part must be non-empty, otherwise an exception is thrown (via enforce).
     */
    private void fileOptionHandler(string option, string optionVal) pure @safe
    {
        import std.algorithm : findSplit;
        import std.format : format;

        auto valSplit = findSplit(optionVal, "=");

        enforce(!valSplit[0].empty && !valSplit[2].empty,
                format("Invalid option value: '--%s %s'. Expected: '--%s <source>=<file>'.",
                       option, optionVal, option));

        auto source = valSplit[0];
        auto filepath = valSplit[2];
        files ~= filepath;
        fileSources ~= source;
        fileSourceNames[filepath] = source;
    }

    /** Command line argument processing.
     *
     * Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated:
     *  - '--file' or '--source-header' turn on source tracking.
     *  - '--source-header' turns on header processing.
     *  - With header processing on and no explicit source header, the header is 'file'.
     *  - Remaining (positional) arguments are appended to 'files' after any '--file'
     *    entries, using the file name without directory or extension as source value.
     *
     * Errors during option parsing are reported on stderr and yield (false, 1).
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "t|track-source", " Track the source file. Adds a column with the source name.", &trackSource,
                "s|source-header", "STR Use STR as the header for the source column. Implies --H|header and --t|track-source. Default: 'file'", &sourceHeader,
                "f|file", "STR=FILE Read file FILE, using STR as the 'source' value. Implies --t|track-source.", &fileOptionHandler,
                "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-append"));
                return tuple(false, 0);
            }

            /* Derivations and consistency checks. At this point 'files' holds only
             * entries added by --f|file, so a non-empty list implies source tracking.
             */
            if (files.length > 0 || !sourceHeader.empty) trackSource = true;
            if (!sourceHeader.empty) hasHeader = true;
            if (hasHeader && sourceHeader.empty) sourceHeader = "file";

            /* Assume the remaining arguments are filepaths. The source value is the
             * file name without directory and extension. Note that a positional dash
             * ('-') therefore gets '-' as its source value.
             */
            foreach (fp; cmdArgs[1 .. $])
            {
                auto source = fp.stripExtension.baseName;
                files ~= fp;
                fileSources ~= source;
                fileSourceNames[fp] = source;
            }

            /* Add a name mapping for dash ('-') unless one was already registered. This
             * is the mapping used when no files are given and standard input is read.
             */
            if ("-" !in fileSourceNames) fileSourceNames["-"] = "stdin";
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/** tsvAppend implements the basic functionality of the tsv-append program.
 *
 * Reads each file in cmdopt.files in order (standard input if the list is empty; a
 * file named '-' also means standard input) and writes every line to outputStream,
 * terminated by a newline.
 *
 * If cmdopt.hasHeader is set, the first line of each file is treated as a header;
 * only the first header encountered is written. If cmdopt.trackSource is set, each
 * output line is prefixed with the source value and the delimiter (the header line
 * is prefixed with cmdopt.sourceHeader instead).
 *
 * The source value for files[i] is cmdopt.fileSources[i] when that array has one
 * entry per file. Otherwise (e.g. an options struct filled in by hand) the value is
 * looked up by path in cmdopt.fileSourceNames.
 */
void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import tsv_utils.common.utils : bufferedByLine;

    /* With no files given, read standard input. */
    auto inputFiles = (cmdopt.files.length > 0) ? cmdopt.files : ["-"];

    /* Per-entry source values keep duplicates of the same path distinct. Only use
     * them when they line up one-to-one with the file list.
     */
    immutable bool usePerFileSources =
        cmdopt.files.length > 0 && cmdopt.fileSources.length == cmdopt.files.length;

    bool headerWritten = false;
    foreach (fileIndex, filename; inputFiles)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        auto sourceName = usePerFileSources
            ? cmdopt.fileSources[fileIndex]
            : cmdopt.fileSourceNames[filename];

        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            if (cmdopt.hasHeader && fileLineNum == 1)
            {
                /* Header line: written once, from the first file that has one. */
                if (!headerWritten)
                {
                    if (cmdopt.trackSource)
                    {
                        outputStream.put(cmdopt.sourceHeader);
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put('\n');
                    headerWritten = true;
                }
            }
            else
            {
                if (cmdopt.trackSource)
                {
                    outputStream.put(sourceName);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put('\n');
            }
        }
    }
}

version(unittest)
{
    /* Unit test helper functions. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.

    /* Runs tsvAppend with the given command line (cmdArgs[0] is the test name) and
     * asserts that the output equals 'expected' rendered as TSV text.
     */
    void testTsvAppend(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvAppend] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testTsvAppend] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvAppendOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;   // processArgs modifies cmdArgs
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        auto output = appender!(char[])();
        tsvAppend(cmdopt, output);
        auto expectedOutput = expected.tsvDataToString;

        assert(output.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, output.data.to!string));
    }
}

unittest
{
    import std.path : buildPath;
    import std.file : rmdirRecurse;
    import std.format : format;

    auto testDir = makeUnittestTempDir("tsv_append");
    scope(exit) testDir.rmdirRecurse;

    string[][] data1 =
        [["field_a", "field_b", "field_c"],
         ["red", "17", "κόκκινος"],
         ["blue", "12", "άσπρο"]];

    string[][] data2 =
        [["field_a", "field_b", "field_c"],
         ["green", "13.5", "κόκκινος"],
         ["blue", "15", "πράσινος"]];

    string[][] data3 =
        [["field_a", "field_b", "field_c"],
         ["yellow", "9", "κίτρινος"]];

    string[][] dataHeaderRowOnly =
        [["field_a", "field_b", "field_c"]];

    string[][] dataEmpty = [[]];

    string filepath1 = buildPath(testDir, "file1.tsv");
    string filepath2 = buildPath(testDir, "file2.tsv");
    string filepath3 = buildPath(testDir, "file3.tsv");
    string filepathHeaderRowOnly = buildPath(testDir, "fileHeaderRowOnly.tsv");
    string filepathEmpty = buildPath(testDir, "fileEmpty.tsv");

    writeUnittestTsvFile(filepath1, data1);
    writeUnittestTsvFile(filepath2, data2);
    writeUnittestTsvFile(filepath3, data3);
    writeUnittestTsvFile(filepathHeaderRowOnly, dataHeaderRowOnly);
    writeUnittestTsvFile(filepathEmpty, dataEmpty);

    testTsvAppend(["test-1", filepath1], data1);
    testTsvAppend(["test-2", "--header", filepath1], data1);
    testTsvAppend(["test-3", filepath1, filepath2], data1 ~ data2);

    testTsvAppend(["test-4", "--header", filepath1, filepath2],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"]]);

    testTsvAppend(["test-5", "--header", filepath1, filepath2, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-6", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3);

    testTsvAppend(["test-7", "--header", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-8", "--track-source", filepath1, filepath2],
                  [["file1", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "field_a", "field_b", "field_c"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    testTsvAppend(["test-9", "--header", "--track-source", filepath1, filepath2],
                  [["file", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    testTsvAppend(["test-10", "-H", "-t", "--source-header", "source",
                   filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["source", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-11", "-H", "-t", "-s", "id", "--file", format("1a=%s", filepath1),
                   "--file", format("1b=%s", filepath2), "--file", format("1c=%s", filepath3)],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["1c", "yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-12", "-s", "id", "-f", format("1a=%s", filepath1),
                   "-f", format("1b=%s", filepath2), filepath3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    /* The same file listed twice with different source values keeps each value. */
    testTsvAppend(["test-13", "-s", "id", "-f", format("1a=%s", filepath3),
                   "-f", format("1b=%s", filepath3)],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "yellow", "9", "κίτρινος"],
                   ["1b", "yellow", "9", "κίτρινος"]]);

    /* A positional file does not overwrite the source value of an earlier --file entry. */
    testTsvAppend(["test-14", "-H", "-f", format("x=%s", filepath3), filepath3],
                  [["file", "field_a", "field_b", "field_c"],
                   ["x", "yellow", "9", "κίτρινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);
}