/**
Command line tool that appends multiple TSV files. It is header aware and supports
tracking the original source file of each row.

Copyright (c) 2017-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_append;

import std.conv : to;
import std.range;
import std.stdio;
import std.typecons : tuple;

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program. Invokes command line arg processing and tsv-append to perform
     * the real work. Any errors are caught and reported.
     *
     * Returns the process exit code: the code chosen by processArgs when execution
     * should not continue, 1 if tsvAppend throws, and 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        import tsvutil : BufferedOutputRange;

        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvAppendOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];

        try tsvAppend(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout));
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

/* Full help text, printed for --help-verbose. The option list is appended by
 * defaultGetoptPrinter.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, similar to the Unix 'cat' utility.
Unlike 'cat', it is header aware ('--H|header'), writing the header from only
the first file. It also supports source tracking, adding a column indicating
the original file to each row. Results are written to standard output.

Concatenation with header support is useful when preparing data for traditional
Unix utilities like 'sort' and 'sed' or applications that read a single file.

Source tracking is useful when creating long/narrow form tabular data, a format
used by many statistics and data mining packages. In this scenario, files have
been used to capture related data sets, the difference between data sets being a
condition represented by the file. For example, results from different variants
of an experiment might each be recorded in their own files. Retaining the source
file as an output column preserves the condition represented by the file.

The file-name (without extension) is used as the source value. This can be
customized using the --f|file option.

Example: Header processing:

   $ tsv-append -H file1.tsv file2.tsv file3.tsv

Example: Header processing and source tracking:

   $ tsv-append -H -t file1.tsv file2.tsv file3.tsv

Example: Source tracking with custom values:

   $ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv

Options:
EOS";

/* Short help text, printed for --help. The option list is appended by
 * defaultGetoptPrinter.
 */
auto helpText = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, reading from files or standard input
and writing to standard output. It is header aware ('--H|header'), writing the
header from only the first file. It also supports source tracking, adding an
indicator of original file to each row of input.

Options:
EOS";

/** Container for command line options.
 *
 * 'files' lists the inputs in processing order: files given via --f|file first (in
 * the order getopt delivers them), followed by the remaining positional arguments.
 * 'fileSourceNames' maps each file path to the value written in the source column.
 */
struct TsvAppendOptions
{
    string programName;
    string[] files;                    // Input files
    string[string] fileSourceNames;    // Maps file path to the 'source' value
    bool helpVerbose = false;          // --help-verbose
    string sourceHeader;               // --s|source-header
    bool trackSource = false;          // --t|track-source
    bool hasHeader = false;            // --H|header
    char delim = '\t';                 // --d|delimiter
    bool versionWanted = false;        // --V|version

    /* fileOptionHandler processes the '--f|file source=file' option.
     *
     * The option value is split at the first '='. Both the source name and the file
     * path must be non-empty, otherwise an Exception is thrown. The file is appended
     * to 'files' and its source name recorded in 'fileSourceNames'.
     *
     * Source names are keyed by file path, so a later mapping for the same path
     * replaces an earlier one.
     */
    private void fileOptionHandler(string option, string optionVal)
    {
        import std.algorithm : findSplit;
        import std.format : format;

        auto valSplit = findSplit(optionVal, "=");
        if (valSplit[0].empty || valSplit[2].empty)
            throw new Exception(
                format("Invalid option value: '--%s %s'. Expected: '--%s <source>=<file>'.",
                       option, optionVal, option));

        auto source = valSplit[0];
        auto filepath = valSplit[2];
        files ~= filepath;
        fileSourceNames[filepath] = source;
    }

    /** Command line argument processing.
     *
     * Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help or version information. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated:
     *  - Use of --f|file or --s|source-header turns on source tracking.
     *  - Use of --s|source-header turns on header processing.
     *  - With header processing on, the source header defaults to "file".
     *  - Arguments remaining after option processing are appended to 'files', each
     *    mapped to a source name of its base name without extension.
     *  - A source name of "stdin" is registered for "-" if "-" has no mapping yet.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first. Passed by reference to
     *             getopt, which consumes the options it recognizes.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "          Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",        "          Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "t|track-source",  "          Track the source file. Adds a column with the source name.", &trackSource,
                "s|source-header", "STR       Use STR as the header for the source column. Implies --H|header and --t|track-source. Default: 'file'", &sourceHeader,
                "f|file",          "STR=FILE  Read file FILE, using STR as the 'source' value. Implies --t|track-source.", &fileOptionHandler,
                "d|delimiter",     "CHR       Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version",       "          Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-append"));
                return tuple(false, 0);
            }

            /* Derivations and consistency checks.
             * At this point 'files' holds only entries added by the --f|file option.
             */
            if (files.length > 0 || !sourceHeader.empty) trackSource = true;
            if (!sourceHeader.empty) hasHeader = true;
            if (hasHeader && sourceHeader.empty) sourceHeader = "file";

            /* Assume the remaining arguments are filepaths. The source name is the
             * file's base name without extension. This assignment replaces any
             * mapping previously set for the same path by the --f|file option.
             */
            foreach (fp; cmdArgs[1 .. $])
            {
                files ~= fp;
                fileSourceNames[fp] = fp.stripExtension.baseName;
            }

            /* Add a name mapping for dash ('-') unless one already exists.
             * NOTE(review): a positional '-' is mapped by the loop above to the source
             * name "-", so "stdin" is used only when '-' was not given explicitly,
             * e.g. when no files are listed at all. Confirm this is intended.
             */
            if ("-" !in fileSourceNames) fileSourceNames["-"] = "stdin";
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/** tsvAppend implements the basic functionality of the tsv-append program.
 *
 * Reads each input in cmdopt.files in order (standard input if the list is empty or
 * the name is "-") and writes every line to outputStream, terminated by a newline.
 *
 * When cmdopt.hasHeader is set, the first line of each input is treated as a header.
 * Only the first header encountered is written; headers of later inputs are dropped.
 * When cmdopt.trackSource is set, each output line is prefixed by a source column:
 * cmdopt.sourceHeader on the header line, the input's source name on all other lines.
 *
 * Params:
 *   cmdopt       = Options, normally populated by TsvAppendOptions.processArgs.
 *   outputStream = Output range of char receiving the result.
 */
void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    bool headerWritten = false;
    foreach (filename; (cmdopt.files.length > 0) ? cmdopt.files : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        auto sourceName = cmdopt.fileSourceNames[filename];

        /* Line numbers start at 1 so the header line is easy to identify. */
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (cmdopt.hasHeader && fileLineNum == 1)
            {
                /* Header line. Written once, from the first input that has one. */
                if (!headerWritten)
                {
                    if (cmdopt.trackSource)
                    {
                        outputStream.put(cmdopt.sourceHeader);
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put('\n');
                    headerWritten = true;
                }
            }
            else
            {
                /* Data line. */
                if (cmdopt.trackSource)
                {
                    outputStream.put(sourceName);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put('\n');
            }
        }
    }
}

version(unittest)
{
    /* Unit test helper functions. */

    import unittest_utils;   // tsv unit test helpers, from common/src/.

    /* Runs tsvAppend with the given command line and asserts that the output equals
     * the expected rows. cmdArgs[0] is the program name and is used as the test label
     * in assertion messages.
     */
    void testTsvAppend(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvAppend] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testTsvAppend] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvAppendOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;   // processArgs modifies cmdArgs.
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        auto output = appender!(char[])();
        tsvAppend(cmdopt, output);
        auto expectedOutput = expected.tsvDataToString;

        assert(output.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, output.data.to!string));
    }
}

unittest
{
    import std.path : buildPath;
    import std.file : rmdirRecurse;
    import std.format : format;

    auto testDir = makeUnittestTempDir("tsv_append");
    scope(exit) testDir.rmdirRecurse;

    string[][] data1 =
        [["field_a", "field_b", "field_c"],
         ["red", "17", "κόκκινος"],
         ["blue", "12", "άσπρο"]];

    string[][] data2 =
        [["field_a", "field_b", "field_c"],
         ["green", "13.5", "κόκκινος"],
         ["blue", "15", "πράσινος"]];

    string[][] data3 =
        [["field_a", "field_b", "field_c"],
         ["yellow", "9", "κίτρινος"]];

    string[][] dataHeaderRowOnly =
        [["field_a", "field_b", "field_c"]];

    string[][] dataEmpty = [[]];

    string filepath1 = buildPath(testDir, "file1.tsv");
    string filepath2 = buildPath(testDir, "file2.tsv");
    string filepath3 = buildPath(testDir, "file3.tsv");
    string filepathHeaderRowOnly = buildPath(testDir, "fileHeaderRowOnly.tsv");
    string filepathEmpty = buildPath(testDir, "fileEmpty.tsv");

    writeUnittestTsvFile(filepath1, data1);
    writeUnittestTsvFile(filepath2, data2);
    writeUnittestTsvFile(filepath3, data3);
    writeUnittestTsvFile(filepathHeaderRowOnly, dataHeaderRowOnly);
    writeUnittestTsvFile(filepathEmpty, dataEmpty);

    /* Plain concatenation, with and without header processing. */
    testTsvAppend(["test-1", filepath1], data1);
    testTsvAppend(["test-2", "--header", filepath1], data1);
    testTsvAppend(["test-3", filepath1, filepath2], data1 ~ data2);

    testTsvAppend(["test-4", "--header", filepath1, filepath2],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"]]);

    testTsvAppend(["test-5", "--header", filepath1, filepath2, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    /* Empty and header-only files mixed in. */
    testTsvAppend(["test-6", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3);

    testTsvAppend(["test-7", "--header", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    /* Source tracking. */
    testTsvAppend(["test-8", "--track-source", filepath1, filepath2],
                  [["file1", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "field_a", "field_b", "field_c"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    testTsvAppend(["test-9", "--header", "--track-source", filepath1, filepath2],
                  [["file", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    testTsvAppend(["test-10", "-H", "-t", "--source-header", "source",
                   filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["source", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    /* Custom source values via --f|file. */
    testTsvAppend(["test-11", "-H", "-t", "-s", "id", "--file", format("1a=%s", filepath1),
                   "--file", format("1b=%s", filepath2), "--file", format("1c=%s", filepath3)],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["1c", "yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-12", "-s", "id", "-f", format("1a=%s", filepath1),
                   "-f", format("1b=%s", filepath2), filepath3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);
}