1 /** 2 Command line tool using fields in a tab-separated value file to identify equivalent 3 lines. Can either remove the duplicate entries or mark as equivalence classes. 4 5 This tool reads a tab-separated value file line by line, using one or more fields to 6 record a key. If the same key is found in a subsequent line, it is identified as 7 equivalent. When operating in 'uniq' mode, the first time a key is seen the line is 8 written to standard output, but subsequent matching lines are discarded. 9 10 The alternate to 'uniq' is 'equiv-class' identification. In this mode, all lines 11 written to standard output, but a new field is added marking equivalent entries with 12 with an ID. The ID is simply a one-upped counter. 13 14 Copyright (c) 2015-2018, eBay Software Foundation 15 Initially written by Jon Degenhardt 16 17 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) 18 */ 19 module tsv_uniq; 20 21 import std.stdio; 22 import std.format : format; 23 import std.typecons : tuple; 24 25 auto helpText = q"EOS 26 Synopsis: tsv-uniq [options] [file...] 27 28 tsv-uniq filters out duplicate lines using fields as a key. Filtering is based 29 on the entire line if a key is not provided. 30 31 Options: 32 EOS"; 33 34 auto helpTextVerbose = q"EOS 35 Synopsis: tsv-uniq [options] [file...] 36 37 tsv-uniq identifies equivalent lines in tab-separated value files. Input is read 38 line by line, recording a key based on one or more of the fields. Two lines are 39 equivalent if they have the same key. When operating in 'uniq' mode, the first 40 time a key is seen the line is written to standard output, but subsequent lines 41 are discarded. This is similar to the unix 'uniq' program, but based on individual 42 fields and without requiring sorted data. This command uniq's on fields 2 and 3: 43 44 tsv-uniq -f 2,3 file.tsv 45 46 The alternate to 'uniq' mode is 'equiv-class' identification. In this mode, all 47 lines are written to standard output, but with a new field added marking 48 equivalent entries with an ID. The ID is simply a one-upped counter. Example: 49 50 tsv-uniq --header -f 2,3 --equiv file.tsv 51 52 tsv-uniq can be run without specifying a key field. In this case the whole line 53 is used as a key, same as the Unix 'uniq' program. This works on any line-oriented 54 text file, not just TSV files. 55 56 The '--r|repeated' option can be used to print only lines occurring more than 57 once. '--a|at-least N' is similar, except that it only prints lines occuring at 58 least N times. For both, the Nth line found is printed, in the order found. 59 60 The '--m|max MAX' option changes the behavior to output the first MAX lines for 61 each key, rather than just the first line for each key. This can also with used 62 with '--e|equiv' to limit the number output for each equivalence class. 63 64 It's not obvious when both '--a|at-least' and '--m|max' might be useful, but, if 65 both are specified, the occurrences between 'at-least' and 'max' are output. 66 67 Options: 68 EOS"; 69 70 /** Container for command line options. 71 */ 72 struct TsvUniqOptions 73 { 74 enum defaultEquivHeader = "equiv_id"; 75 enum defaultEquivStartID = 1; 76 77 string programName; 78 bool helpVerbose = false; // --h|help-verbose 79 bool versionWanted = false; // --V|version 80 size_t[] fields; // --f|fields 81 bool hasHeader = false; // --H|header 82 bool onlyRepeated = false; // --r|repeated. Shorthand for '--atleast 2' 83 size_t atLeast = 0; // --a|at-least. Zero implies default behavior. 84 size_t max = 0; // --m|max. Zero implies default behavior. 85 bool equivMode = false; // --e|equiv 86 string equivHeader = defaultEquivHeader; // --equiv-header 87 long equivStartID = defaultEquivStartID; // --equiv-start 88 bool ignoreCase = false; // --i|ignore-case 89 char delim = '\t'; // --d|delimiter 90 bool keyIsFullLine = false; // Derived. True if no fields specified or '--f|fields 0' 91 92 /* Returns a tuple. First value is true if command line arguments were successfully 93 * processed and execution should continue, or false if an error occurred or the user 94 * asked for help. If false, the second value is the appropriate exit code (0 or 1). 95 * 96 * Returning true (execution continues) means args have been validated and derived 97 * values calculated. In addition, field indices have been converted to zero-based. 98 * If the whole line is the key, the individual fields list will be cleared. 99 * 100 * Repeat count control variables 'atLeast' and max' - The general idea is that the 101 * values are left at zero if no repeat count options are specified. They are set if 102 * repeat count options are specified, as follows: 103 * * atLeast - Will be zero unless --r|repeated or --a|at-least is specified. 104 * --r|repeated option sets it 2, --a|at-least sets it to the specified value. 105 * * max - Default to zero. Is set to the --m|max value if provided. Is set to 106 * 'atLeast' if --r|repeated or --a|at-least is provided. 107 * 108 * An exception to the above: If --e|equiv-mode is specified, then (max == 0) 109 * represents the default "output all values" case. In this case max may be less 110 * than the at-least value. 111 */ 112 auto processArgs (ref string[] cmdArgs) 113 { 114 import std.algorithm : any, each; 115 import std.getopt; 116 import std.path : baseName, stripExtension; 117 import std.typecons : Yes, No; 118 import tsvutil : makeFieldListOptionHandler; 119 120 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 121 122 try 123 { 124 arraySep = ","; // Use comma to separate values in command line options 125 auto r = getopt( 126 cmdArgs, 127 "help-verbose", " Print full help.", &helpVerbose, 128 std.getopt.config.caseSensitive, 129 "V|version", " Print version information and exit.", &versionWanted, 130 "H|header", " Treat the first line of each file as a header.", &hasHeader, 131 std.getopt.config.caseInsensitive, 132 133 "f|fields", "<field-list> Fields to use as the key. Default: 0 (entire line).", 134 fields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex,Yes.allowFieldNumZero), 135 136 "i|ignore-case", " Ignore case when comparing keys.", &ignoreCase, 137 "r|repeated", " Output only lines that are repeated (based on the key).", &onlyRepeated, 138 "a|at-least", "INT Output only lines that are repeated INT times (based on the key). Zero and one are ignored.", &atLeast, 139 "m|max", "INT Max number of each unique key to output (zero is ignored).", &max, 140 "e|equiv", " Output equiv class IDs rather than uniq'ing entries.", &equivMode, 141 "equiv-header", "STR Use STR as the equiv-id field header. Applies when using '--header --equiv'. Default: 'equiv_id'.", &equivHeader, 142 "equiv-start", "INT Use INT as the first equiv-id. Default: 1.", &equivStartID, 143 "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim, 144 ); 145 146 if (r.helpWanted) 147 { 148 defaultGetoptPrinter(helpText, r.options); 149 return tuple(false, 0); 150 } 151 else if (helpVerbose) 152 { 153 defaultGetoptPrinter(helpTextVerbose, r.options); 154 return tuple(false, 0); 155 } 156 else if (versionWanted) 157 { 158 import tsvutils_version; 159 writeln(tsvutilsVersionNotice("tsv-uniq")); 160 return tuple(false, 0); 161 } 162 163 /* Consistency checks */ 164 if (!equivMode) 165 { 166 if (equivHeader != defaultEquivHeader) 167 { 168 throw new Exception("--equiv-header requires --e|equiv"); 169 } 170 else if (equivStartID != defaultEquivStartID) 171 { 172 throw new Exception("--equiv-start requires --e|equiv"); 173 } 174 } 175 176 if (fields.length > 1 && fields.any!(x => x == 0)) 177 { 178 throw new Exception("Whole line as key (--f|field 0) cannot be combined with multiple fields."); 179 } 180 181 /* Derivations */ 182 if (fields.length == 0) 183 { 184 keyIsFullLine = true; 185 } 186 else if (fields.length == 1 && fields[0] == 0) 187 { 188 keyIsFullLine = true; 189 fields.length = 0; 190 } 191 192 if (onlyRepeated && atLeast <= 1) atLeast = 2; 193 if (atLeast >= 2 && max < atLeast) 194 { 195 // Don't modify max if it is zero and equivMode is in effect. 196 if (max != 0 || !equivMode) max = atLeast; 197 } 198 199 fields.each!((ref x) => --x); // Convert to 1-based indexing. 200 201 } 202 catch (Exception exc) 203 { 204 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 205 return tuple(false, 1); 206 } 207 return tuple(true, 0); 208 } 209 } 210 211 /** Main program. Processes command line arguments and calls tsvUniq which implements 212 * the main processing logic. 213 */ 214 int main(string[] cmdArgs) 215 { 216 /* When running in DMD code coverage mode, turn on report merging. */ 217 version(D_Coverage) version(DigitalMars) 218 { 219 import core.runtime : dmd_coverSetMerge; 220 dmd_coverSetMerge(true); 221 } 222 223 TsvUniqOptions cmdopt; 224 auto r = cmdopt.processArgs(cmdArgs); 225 if (!r[0]) return r[1]; 226 try tsvUniq(cmdopt, cmdArgs[1..$]); 227 catch (Exception exc) 228 { 229 stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 230 return 1; 231 } 232 return 0; 233 } 234 235 /** Outputs the unique lines from all the input files. 236 * 237 * Processes the lines in each input file. All lines are added to an associated array. 238 * The first time a line is seen it is output. If key fields are being used these are 239 * used as the basis for the associative array entries rather than the full line. 240 */ 241 void tsvUniq(in TsvUniqOptions cmdopt, in string[] inputFiles) 242 { 243 import tsvutil : InputFieldReordering, BufferedOutputRange; 244 import std.algorithm : splitter; 245 import std.array : join; 246 import std.conv : to; 247 import std.range; 248 import std.uni : toLower; 249 250 /* InputFieldReordering maps the key fields from an input line to a separate buffer. */ 251 auto keyFieldsReordering = new InputFieldReordering!char(cmdopt.fields); 252 253 /* BufferedOutputRange is a performance enhancement for writing to stdout. */ 254 auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); 255 256 /* The master hash. The key is the specified fields concatenated together (including 257 * separators). The value is a struct with the equiv-id and occurrence count. 258 */ 259 struct EquivEntry { size_t equivID; size_t count; } 260 EquivEntry[string] equivHash; 261 262 size_t numFields = cmdopt.fields.length; 263 long nextEquivID = cmdopt.equivStartID; 264 bool headerWritten = false; 265 foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"]) 266 { 267 auto inputStream = (filename == "-") ? stdin : filename.File(); 268 foreach (lineNum, line; inputStream.byLine.enumerate(1)) 269 { 270 if (cmdopt.hasHeader && lineNum == 1) 271 { 272 /* Header line. */ 273 if (!headerWritten) 274 { 275 bufferedOutput.append(line); 276 if (cmdopt.equivMode) 277 { 278 bufferedOutput.append(cmdopt.delim); 279 bufferedOutput.append(cmdopt.equivHeader); 280 } 281 bufferedOutput.appendln(); 282 headerWritten = true; 283 } 284 } 285 else 286 { 287 /* Regular line (not header). Start by finding the key. */ 288 typeof(line) key; 289 if (cmdopt.keyIsFullLine) 290 { 291 key = line; 292 } 293 else 294 { 295 /* Copy the key fields to a new buffer. */ 296 keyFieldsReordering.initNewLine; 297 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 298 { 299 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 300 if (keyFieldsReordering.allFieldsFilled) break; 301 } 302 303 if (!keyFieldsReordering.allFieldsFilled) 304 { 305 throw new Exception( 306 format("Not enough fields in line. File: %s, Line: %s", 307 (filename == "-") ? "Standard Input" : filename, lineNum)); 308 } 309 310 key = keyFieldsReordering.outputFields.join(cmdopt.delim); 311 } 312 313 if (cmdopt.ignoreCase) key = key.toLower; 314 315 bool isOutput = false; 316 EquivEntry currEntry; 317 EquivEntry* priorEntry = (key in equivHash); 318 if (priorEntry is null) 319 { 320 isOutput = (cmdopt.atLeast <= 1); 321 currEntry.equivID = nextEquivID; 322 currEntry.count = 1; 323 equivHash[key.to!string] = currEntry; 324 nextEquivID++; 325 } 326 else 327 { 328 (*priorEntry).count++; 329 currEntry = *priorEntry; 330 331 if ((currEntry.count <= cmdopt.max && currEntry.count >= cmdopt.atLeast) || 332 (cmdopt.equivMode && cmdopt.max == 0)) 333 { 334 isOutput = true; 335 } 336 } 337 338 if (isOutput) 339 { 340 bufferedOutput.append(line); 341 if (cmdopt.equivMode) 342 { 343 bufferedOutput.append(cmdopt.delim); 344 bufferedOutput.append(currEntry.equivID.to!string); 345 } 346 bufferedOutput.appendln(); 347 } 348 } 349 } 350 } 351 }