nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Apache Wiki <wikidi...@apache.org>
Subject [Nutch Wiki] Trivial Update of "Automating_Fetches_with_Python" by newacct
Date Sun, 29 Nov 2009 03:19:00 GMT
Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change notification.

The "Automating_Fetches_with_Python" page has been changed by newacct.
http://wiki.apache.org/nutch/Automating_Fetches_with_Python?action=diff&rev1=5&rev2=6

--------------------------------------------------

  import sys
  import getopt
  import re
- import string
  import logging
  import logging.config
  import commands
@@ -259, +258 @@

        total_urls += 1
      urllinecount.close()
      numsplits = total_urls / splitsize
-     padding = "0" * len(`numsplits`)
+     padding = "0" * len(repr(numsplits))
  
      # create the url load folder
-     linenum = 0
      filenum = 0
-     strfilenum = `filenum`
+     strfilenum = repr(filenum)
      urloutdir = outdir + "/urls-" + padding[len(strfilenum):] + strfilenum
      os.mkdir(urloutdir)
      urlfile = urloutdir + "/urls"
@@ -275, +273 @@

      outhandle = open(urlfile, "w")
  
      # loop through the file
-     for line in inhandle:
+     for linenum, line in enumerate(inhandle):
  
        # if we have come to a split then close the current file, create a new
        # url folder and open a new url file
-       if linenum > 0 and (linenum % splitsize == 0):
+       if linenum > 0 and linenum % splitsize == 0:
  
-         filenum = filenum + 1
+         filenum += 1
-         strfilenum = `filenum`
+         strfilenum = repr(filenum)
          urloutdir = outdir + "/urls-" + padding[len(strfilenum):] + strfilenum
          os.mkdir(urloutdir)
          urlfile = urloutdir + "/urls"
@@ -290, +288 @@

          outhandle.close()
          outhandle = open(urlfile, "w")
  
-       # write the url to the file and increase the number of lines read
+       # write the url to the file
        outhandle.write(line)
-       linenum = linenum + 1
  
      # close the input and output files
      inhandle.close()
@@ -362, +359 @@

  
          # fetch the current segment
          outar = result[1].splitlines()
-         output = outar[len(outar) - 1]
+         output = outar[-1]
-         tempseg = string.split(output)[0]
+         tempseg = output.split()[0]
          tempseglist.append(tempseg)
          fetch = self.nutchdir + "/bin/nutch fetch " + tempseg
          self.log.info("Starting fetch for: " + tempseg)
@@ -392, +389 @@

  
        # merge the crawldbs
        self.log.info("Merging master and temp crawldbs.")
-       crawlmerge = (self.nutchdir + "/bin/nutch mergedb mergetemp/crawldb " +
+       crawlmerge = self.nutchdir + "/bin/nutch mergedb mergetemp/crawldb " + \
-         mastercrawldbdir + " " + string.join(tempdblist, " "))
+         mastercrawldbdir + " " + " ".join(tempdblist)
        self.log.info("Running: " + crawlmerge)
        result = commands.getstatusoutput(crawlmerge)
        self.checkStatus(result, "Error occurred while running command" + crawlmerge)
@@ -404, +401 @@

        result = commands.getstatusoutput(getsegment)
        self.checkStatus(result, "Error occurred while running command" + getsegment)
        outar = result[1].splitlines()
-       output = outar[len(outar) - 1]
+       output = outar[-1]
-       masterseg = string.split(output)[0]
+       masterseg = output.split()[0]
-       mergesegs = (self.nutchdir + "/bin/nutch mergesegs mergetemp/segments " +
+       mergesegs = self.nutchdir + "/bin/nutch mergesegs mergetemp/segments " + \
-         masterseg + " " + string.join(tempseglist, " "))
+         masterseg + " " + " ".join(tempseglist)
        self.log.info("Running: " + mergesegs)
        result = commands.getstatusoutput(mergesegs)
        self.checkStatus(result, "Error occurred while running command" + mergesegs)
@@ -464, +461 @@

    usage.append("            [-b | --backupdir] The master backup directory, [crawl-backup].\n")
    usage.append("            [-s | --splitsize] The number of urls per load [500000].\n")
    usage.append("            [-f | --fetchmerge] The number of fetches to run before merging [1].\n")
-   message = string.join(usage)
+   message = " ".join(usage)
    print message
  
  """

Mime
View raw message