# File convertdb.rb, line 75
def main
    $stderr.sync = $stdout.sync = true
    header "WordNet Lexicon Converter"

    errorLimit = 0

    ARGV.options {|oparser|
        oparser.banner = "Usage: #{File::basename($0)} -dv\n"

        # Debugging on/off
        oparser.on( "--debug", "-d", TrueClass, "Turn debugging on" ) {
            $DEBUG = true
            debugMsg "Turned debugging on."
        }

        # Verbose
        oparser.on( "--verbose", "-v", TrueClass, "Verbose progress messages" ) {
            $VERBOSE = true
            debugMsg "Turned verbose on."
        }

        # Error-limit
        oparser.on( "--error-limit=COUNT", "-eCOUNT", Integer,
            "Error limit -- quit after COUNT errors" ) {|arg|
            errorLimit = arg.to_i
            debugMsg "Set error limit to #{errorLimit}"
        }

        # Handle the 'help' option
        oparser.on( "--help", "-h", "Display this text." ) {
            $stderr.puts oparser
            exit!(0)
        }

        oparser.parse!
    }

    # Make sure the user knows what they're in for
    message "This program will convert WordNet data files into databases\n"\
        "used by Ruby-WordNet. This will not affect existing WordNet files,\n"\
        "but will require up to 40Mb of disk space.\n"
    exit unless /^y/i =~ promptWithDefault( "Continue?", "y" )

    # Open the database and check to be sure it's empty. Confirm overwrite if
    # not. Checkpoint and set up logging proc if debugging.
    if File::exists?( WordNet::Lexicon::DbFile )
        message ">>> Warning: Existing data in the Ruby-WordNet databases\n"\
            "will be overwritten.\n"
        abort( "user cancelled." ) unless
            /^y/i =~ promptWithDefault( "Continue?", "n" )
        FileUtils::rm_rf( WordNet::Lexicon::DbFile )
    end

    # Find the source data files
    if ARGV.empty?
        message "Where can I find the WordNet data files?\n"
        datadir = promptWithDefault( "Data directory",
            "/usr/local/WordNet-2.0/dict" )
    else
        datadir = ARGV.shift
    end

    abort( "Directory '#{datadir}' does not exist" ) unless
        File::exists?( datadir )
    abort( "'#{datadir}' is not a directory" ) unless
        File::directory?( datadir )
    testfile = File::join( datadir, "data.noun" )
    abort( "'#{datadir}' doesn't seem to contain the necessary files." ) unless
        File::exists?( testfile )

    # Open the lexicon, which creates a new database under lib/wordnet/lexicon.
    lexicon = WordNet::Lexicon::new

    # Process each fileset
    [   # Fileset, name, database handle, processor
        Fileset::new( IndexFiles, "index", lexicon.indexDb, method(:parseIndexLine) ),
        Fileset::new( MorphFiles, "morph", lexicon.morphDb, method(:parseMorphLine) ),
        Fileset::new( DataFiles,  "data",  lexicon.dataDb,  method(:parseSynsetLine) ),
    ].each {|set|
        message "Converting %s files...\n" % set.name
        set.db.truncate

        # Process each file in the set with the appropriate processor method and
        # insert results into the corresponding table.
        set.files.each {|file,pos|
            message "    #{file}..."
            filepath = File::join( datadir, file )

            if !File::exists?( filepath )
                message "missing: skipped\n"
                next
            end

            txn, dbh = lexicon.env.txn_begin( 0, set.db )
            entries = lineNumber = errors = 0

            File::readlines( filepath ).each {|line|
                lineNumber += 1
                next if /^\s/ =~ line

                key, value = set.processor.call( line.chomp, lineNumber, pos )
                unless key
                    errors += 1
                    if errorLimit.nonzero? && errors >= errorLimit
                        abort( "Too many errors" )
                    end
                    next
                end

                dbh[ key ] = value
                entries += 1
                print "%d%s" % [ entries, "\x08" * entries.to_s.length ]

                # Commit and start a new transaction every 1000 records
                if ( entries % CommitThreshold ).zero?
                    txn.commit( BDB::TXN_NOSYNC )
                    txn, dbh = lexicon.env.txn_begin( 0, set.db )
                end
            }

            message "committing..."
            txn.commit( BDB::TXN_SYNC )
            message "done (%d entries, %d errors).\n" % [ entries, errors ]
        }

        message "Checkpointing DB and cleaning logs..."
        lexicon.checkpoint
        lexicon.cleanLogs
        puts "done."
    }

    message "done.\n\n"
end
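
The conversion loop above is driven by small Fileset records that pair a list of source files with a display name, a target database handle, and a line-parsing callback. A minimal sketch of such a record, assuming the plain Struct shape implied by the accessors used above (the actual definition appears elsewhere in convertdb.rb and may differ):

    # Hypothetical sketch only: a Fileset record exposing the accessors
    # the loop relies on (set.files, set.name, set.db, set.processor).
    Fileset = Struct.new( :files, :name, :db, :processor )

Each processor callback receives the chomped line, its line number, and the part of speech associated with the file, and is expected to return a key/value pair for insertion; a nil key counts against the error limit and the line is skipped.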