1 /** 2 Command line tool that identifies equivalent lines in an input stream. Equivalent 3 lines are identified using either the full line or a set of fields as the key. By 4 default, input is written to standard output, retaining only the first occurrence of 5 equivalent lines. There are also options for marking and numbering equivalent lines 6 rather, without filtering out duplicates. 7 8 This tool is similar in spirit to the Unix 'uniq' tool, with some key differences. 9 First, the key can be composed of individual fields, not just the full line. Second, 10 input does not need to be sorted. (Unix 'uniq' only detects equivalent lines when 11 they are adjacent, hence the usual need for sorting.) 12 13 There are a couple alternative to uniq'ing the input lines. One is to mark lines with 14 an equivalence ID, which is a one-upped counter. The other is to number lines, with 15 each unique key have its own set of numbers. 16 17 Copyright (c) 2015-2020, eBay Inc. 18 Initially written by Jon Degenhardt 19 20 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) 21 */ 22 module tsv_utils.tsv_uniq; 23 24 import std.exception : enforce; 25 import std.format : format; 26 import std.range; 27 import std.stdio; 28 import std.typecons : tuple; 29 30 auto helpText = q"EOS 31 Synopsis: tsv-uniq [options] [file...] 32 33 tsv-uniq filters out duplicate lines using fields as a key. Filtering is 34 based on the entire line when key fields are not provided. Options are 35 also available for assigning a unique id to each key and numbering the 36 occurrences of each key. Fields are specified using field number or field 37 name. Field names can be used when the input file has a header line. 38 39 Use '--help-verbose' for more details. 40 Options: 41 EOS"; 42 43 auto helpTextVerbose = q"EOS 44 Synopsis: tsv-uniq [options] [file...] 45 46 tsv-uniq identifies equivalent lines in tab-separated value files. Input 47 is read line by line, recording a key for each line based on one or more 48 of the fields. Two lines are equivalent if they have the same key. The 49 first time a key is seen its line is written to standard output. 50 Subsequent lines containing the same key are discarded. This command 51 uniq's a file on fields 2 and 3: 52 53 tsv-uniq -f 2,3 file.tsv 54 55 This is similar to the Unix 'uniq' program, but based on individual 56 fields and without requiring sorted data. 57 58 Field names can be used if the input file has a header line. This command 59 uniq's a file based on the 'time' and 'date' fields: 60 61 tsv-uniq -H -f time,date file.tsv 62 63 Use '--help-fields' for details about field names. 64 65 tsv-uniq can be run without specifying a key field. In this case the 66 whole line is used as a key, same as the Unix 'uniq' program. This works 67 on any line-oriented text file, not just TSV files. 68 69 The above is the default behavior ('uniq' mode). The alternates to 'uniq' 70 mode are 'number' mode and 'equiv-class' mode. In 'equiv-class' mode, all 71 lines are written to standard output, but with a field appended marking 72 equivalent entries with an ID. The ID is a one-upped counter. Example: 73 74 tsv-uniq --header -f 2,3 --equiv file.tsv 75 76 'Number' mode also writes all lines to standard output, but with a field 77 appended numbering the occurrence count for the line's key. The first line 78 with a specific key is assigned the number '1', the second with the key is 79 assigned number '2', etc. 'Number' and 'equiv-class' modes can be combined. 80 81 The '--r|repeated' option can be used to print only lines occurring more 82 than once. Specifically, the second occurrence of a key is printed. The 83 '--a|at-least N' option is similar, printing lines occurring at least N 84 times. (Like repeated, the Nth line with the key is printed.) 85 86 The '--m|max MAX' option changes the behavior to output the first MAX 87 lines for each key, rather than just the first line for each key. 88 89 If both '--a|at-least' and '--m|max' are specified, the occurrences 90 starting with 'at-least' and ending with 'max' are output. 91 92 Options: 93 EOS"; 94 95 /** Container for command line options. 96 */ 97 struct TsvUniqOptions 98 { 99 import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader; 100 101 enum defaultEquivHeader = "equiv_id"; 102 enum defaultEquivStartID = 1; 103 enum defaultNumberHeader = "equiv_line"; 104 105 string programName; 106 InputSourceRange inputSources; /// Input files 107 size_t[] fields; /// Derived: --f|fields 108 bool hasHeader = false; /// --H|header 109 bool onlyRepeated = false; /// --r|repeated. Shorthand for '--atleast 2' 110 size_t atLeast = 0; /// --a|at-least. Zero implies default behavior. 111 size_t max = 0; /// --m|max. Zero implies default behavior. 112 bool numberMode = false; /// --z|number 113 string numberHeader = defaultNumberHeader; /// --number-header 114 bool equivMode = false; /// --e|equiv 115 string equivHeader = defaultEquivHeader; /// --equiv-header 116 long equivStartID = defaultEquivStartID; /// --equiv-start 117 bool ignoreCase = false; /// --i|ignore-case 118 char delim = '\t'; /// --d|delimiter 119 bool keyIsFullLine = false; /// Derived. True if no fields specified or '--f|fields 0' 120 121 /* Returns a tuple. First value is true if command line arguments were successfully 122 * processed and execution should continue, or false if an error occurred or the user 123 * asked for help. If false, the second value is the appropriate exit code (0 or 1). 124 * 125 * Returning true (execution continues) means args have been validated and derived 126 * values calculated. In addition, field indices have been converted to zero-based. 127 * If the whole line is the key, the individual fields list will be cleared. 128 * 129 * Repeat count control variables 'atLeast' and max' - These values are left at zero 130 * if no repeat count options are specified. They are set if repeat count options 131 * are specified, as follows: 132 * * atLeast - Will be zero unless --r|repeated or --a|at-least is specified. 133 * --r|repeated option sets it 2, --a|at-least sets it to the specified value. 134 * * max - Default to zero. Is set to the --m|max value if provided. Is set to 135 * 'atLeast' if --r|repeated or --a|at-least is provided. 136 * 137 * An exception to the above: If --e|equiv-mode is specified, then (max == 0) 138 * represents the default "output all values" case. In this case max may be less 139 * than the at-least value. 140 */ 141 auto processArgs (ref string[] cmdArgs) 142 { 143 import std.algorithm : all, each; 144 import std.conv : to; 145 import std.getopt; 146 import std.path : baseName, stripExtension; 147 import std.typecons : Yes, No; 148 import tsv_utils.common.fieldlist; 149 import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix; 150 151 bool helpVerbose = false; // --h|help-verbose 152 bool helpFields = false; // --help-fields 153 bool versionWanted = false; // --V|version 154 string fieldsArg; // --f|fields 155 156 string fieldsOptionString = "f|fields"; 157 158 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 159 160 try 161 { 162 arraySep = ","; // Use comma to separate values in command line options 163 auto r = getopt( 164 cmdArgs, 165 "help-verbose", " Print full help.", &helpVerbose, 166 "help-fields", " Print help on specifying fields.", &helpFields, 167 std.getopt.config.caseSensitive, 168 "V|version", " Print version information and exit.", &versionWanted, 169 "H|header", " Treat the first line of each file as a header.", &hasHeader, 170 std.getopt.config.caseInsensitive, 171 172 fieldsOptionString, "<field-list> Fields to use as the key. Default: 0 (entire line).", &fieldsArg, 173 174 "i|ignore-case", " Ignore case when comparing keys.", &ignoreCase, 175 "r|repeated", " Output only lines that are repeated (based on the key).", &onlyRepeated, 176 "a|at-least", "INT Output only lines that are repeated INT times (based on the key). Zero and one are ignored.", &atLeast, 177 "m|max", "INT Max number of each unique key to output (zero is ignored).", &max, 178 "e|equiv", " Output equivalence class IDs rather than uniq'ing entries.", &equivMode, 179 "equiv-header", "STR Use STR as the equiv-id field header (when using '-H --equiv'). Default: 'equiv_id'.", &equivHeader, 180 "equiv-start", "INT Use INT as the first equiv-id. Default: 1.", &equivStartID, 181 "z|number", " Output equivalence class occurrence counts rather than uniq'ing entries.", &numberMode, 182 "number-header", "STR Use STR as the '--number' field header (when using '-H --number)'. Default: 'equiv_line'.", &numberHeader, 183 "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim, 184 ); 185 186 if (r.helpWanted) 187 { 188 defaultGetoptPrinter(helpText, r.options); 189 return tuple(false, 0); 190 } 191 else if (helpVerbose) 192 { 193 defaultGetoptPrinter(helpTextVerbose, r.options); 194 return tuple(false, 0); 195 } 196 else if (helpFields) 197 { 198 writeln(fieldListHelpText); 199 return tuple(false, 0); 200 } 201 else if (versionWanted) 202 { 203 import tsv_utils.common.tsvutils_version; 204 writeln(tsvutilsVersionNotice("tsv-uniq")); 205 return tuple(false, 0); 206 } 207 208 /* Input files. Remaining command line args are files. */ 209 string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"]; 210 cmdArgs.length = 1; 211 212 /* Validation - Do as much validation prior to header line processing as 213 * possible (avoids waiting on stdin). 214 */ 215 if (!equivMode) 216 { 217 enforce(equivHeader == defaultEquivHeader, "--equiv-header requires --e|equiv"); 218 enforce(equivStartID == defaultEquivStartID, "--equiv-start requires --e|equiv"); 219 } 220 221 enforce(numberMode || numberHeader == defaultNumberHeader, 222 "--number-header requires --z|number"); 223 224 string[] headerFields; 225 226 /* fieldListArgProcessing encapsulates the field list processing. It is 227 * called prior to reading the header line if headers are not being used, 228 * and after if headers are being used. 229 */ 230 void fieldListArgProcessing() 231 { 232 if (!fieldsArg.empty) 233 { 234 fields = 235 fieldsArg 236 .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) 237 (hasHeader, headerFields, fieldsOptionString) 238 .array; 239 } 240 241 enforce(fields.length <= 1 || fields.all!(x => x != 0), 242 "Whole line as key (--f|field 0) cannot be combined with multiple fields."); 243 244 if (fields.length == 0) 245 { 246 keyIsFullLine = true; 247 } 248 else if (fields.length == 1 && fields[0] == 0) 249 { 250 keyIsFullLine = true; 251 fields.length = 0; 252 } 253 254 if (onlyRepeated && atLeast <= 1) atLeast = 2; 255 if (atLeast >= 2 && max < atLeast) 256 { 257 // Don't modify max if it is zero and equivMode or numberMode is in effect. 258 if (max != 0 || (!equivMode && !numberMode)) max = atLeast; 259 } 260 261 if (!keyIsFullLine) fields.each!((ref x) => --x); // Convert to 0-based indexing. 262 } 263 264 if (!hasHeader) fieldListArgProcessing(); 265 266 /* 267 * Create the inputSourceRange and perform header line processing. 268 */ 269 ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 270 inputSources = inputSourceRange(filepaths, readHeader); 271 272 if (hasHeader) 273 { 274 throwIfWindowsNewlineOnUnix(inputSources.front.header, inputSources.front.name, 1); 275 headerFields = inputSources.front.header.split(delim).to!(string[]); 276 fieldListArgProcessing(); 277 } 278 } 279 catch (Exception exc) 280 { 281 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 282 return tuple(false, 1); 283 } 284 return tuple(true, 0); 285 } 286 } 287 288 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; 289 290 /** Main program. Processes command line arguments and calls tsvUniq which implements 291 * the main processing logic. 292 */ 293 int main(string[] cmdArgs) 294 { 295 /* When running in DMD code coverage mode, turn on report merging. */ 296 version(D_Coverage) version(DigitalMars) 297 { 298 import core.runtime : dmd_coverSetMerge; 299 dmd_coverSetMerge(true); 300 } 301 302 TsvUniqOptions cmdopt; 303 auto r = cmdopt.processArgs(cmdArgs); 304 if (!r[0]) return r[1]; 305 306 version(LDC_Profile) 307 { 308 import ldc.profile : resetAll; 309 resetAll(); 310 } 311 312 try tsvUniq(cmdopt); 313 catch (Exception exc) 314 { 315 stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 316 return 1; 317 } 318 return 0; 319 } 320 321 /** Outputs the unique lines from all the input files. 322 * 323 * Processes the lines in each input file. All lines are added to an associated array. 324 * The first time a line is seen it is output. If key fields are being used these are 325 * used as the basis for the associative array entries rather than the full line. 326 */ 327 void tsvUniq(ref TsvUniqOptions cmdopt) 328 { 329 import tsv_utils.common.utils : bufferedByLine, BufferedOutputRange, 330 InputFieldReordering, InputSourceRange, joinAppend, throwIfWindowsNewlineOnUnix; 331 import std.algorithm : splitter; 332 import std.array : appender; 333 import std.conv : to; 334 import std.uni : asLowerCase; 335 import std.utf : byChar; 336 337 /* inputSources must be an InputSourceRange and include at least stdin. */ 338 assert(!cmdopt.inputSources.empty); 339 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 340 341 /* InputFieldReordering maps the key fields from an input line to a separate buffer. */ 342 auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.fields); 343 344 /* BufferedOutputRange is a performance enhancement for writing to stdout. */ 345 auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); 346 347 /* The master hash. The key is the specified fields concatenated together (including 348 * separators). The value is a struct with the equiv-id and occurrence count. 349 */ 350 static struct EquivEntry { size_t equivID; size_t count; } 351 EquivEntry[string] equivHash; 352 353 /* Reusable buffers for multi-field keys and case-insensitive keys. */ 354 auto multiFieldKeyBuffer = appender!(char[]); 355 auto lowerKeyBuffer = appender!(char[]); 356 357 const size_t numKeyFields = cmdopt.fields.length; 358 long nextEquivID = cmdopt.equivStartID; 359 360 /* First header is read during command line arg processing. Flush it immediately 361 * so subsequent processes in a unix command pipeline see it early. This helps 362 * provide timely error messages. 363 */ 364 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 365 { 366 auto inputStream = cmdopt.inputSources.front; 367 368 bufferedOutput.append(inputStream.header); 369 370 if (cmdopt.equivMode) 371 { 372 bufferedOutput.append(cmdopt.delim); 373 bufferedOutput.append(cmdopt.equivHeader); 374 } 375 376 if (cmdopt.numberMode) 377 { 378 bufferedOutput.append(cmdopt.delim); 379 bufferedOutput.append(cmdopt.numberHeader); 380 } 381 382 bufferedOutput.appendln(); 383 bufferedOutput.flush(); 384 } 385 386 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 387 388 foreach (inputStream; cmdopt.inputSources) 389 { 390 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 391 392 foreach (lineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine)) 393 { 394 if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum); 395 396 /* Start by finding the key. */ 397 typeof(line) key; 398 if (cmdopt.keyIsFullLine) 399 { 400 key = line; 401 } 402 else 403 { 404 assert(keyFieldsReordering !is null); 405 406 /* Copy the key fields to a new buffer. */ 407 keyFieldsReordering.initNewLine; 408 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 409 { 410 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 411 if (keyFieldsReordering.allFieldsFilled) break; 412 } 413 414 enforce(keyFieldsReordering.allFieldsFilled, 415 format("Not enough fields in line. File: %s, Line: %s", 416 inputStream.name, lineNum)); 417 418 if (numKeyFields == 1) 419 { 420 key = keyFieldsReordering.outputFields[0]; 421 } 422 else 423 { 424 multiFieldKeyBuffer.clear(); 425 keyFieldsReordering.outputFields.joinAppend(multiFieldKeyBuffer, cmdopt.delim); 426 key = multiFieldKeyBuffer.data; 427 } 428 } 429 430 if (cmdopt.ignoreCase) 431 { 432 /* Equivalent to key = key.toLower, but without memory allocation. */ 433 lowerKeyBuffer.clear(); 434 lowerKeyBuffer.put(key.asLowerCase.byChar); 435 key = lowerKeyBuffer.data; 436 } 437 438 bool isOutput = false; 439 EquivEntry currEntry; 440 EquivEntry* priorEntry = (key in equivHash); 441 if (priorEntry is null) 442 { 443 isOutput = (cmdopt.atLeast <= 1); 444 currEntry.equivID = nextEquivID; 445 currEntry.count = 1; 446 equivHash[key.to!string] = currEntry; 447 nextEquivID++; 448 } 449 else 450 { 451 (*priorEntry).count++; 452 currEntry = *priorEntry; 453 454 if ((currEntry.count <= cmdopt.max && currEntry.count >= cmdopt.atLeast) || 455 (cmdopt.equivMode && cmdopt.max == 0) || 456 (cmdopt.numberMode && cmdopt.max == 0)) 457 { 458 isOutput = true; 459 } 460 } 461 462 if (isOutput) 463 { 464 bufferedOutput.append(line); 465 466 if (cmdopt.equivMode) 467 { 468 bufferedOutput.append(cmdopt.delim); 469 bufferedOutput.append(currEntry.equivID.to!string); 470 } 471 472 if (cmdopt.numberMode) 473 { 474 bufferedOutput.append(cmdopt.delim); 475 bufferedOutput.append(currEntry.count.to!string); 476 } 477 478 bufferedOutput.appendln(); 479 } 480 } 481 } 482 }