nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (NUTCH-2491) Integrate sitemap processing and HostDB into crawl script
Date Wed, 03 Jan 2018 17:35:00 GMT

    [ https://issues.apache.org/jira/browse/NUTCH-2491?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16309956#comment-16309956 ]

ASF GitHub Bot commented on NUTCH-2491:
---------------------------------------

lewismc closed pull request #270: NUTCH-2491: Integrate sitemap processing and HostDB into crawl script
URL: https://github.com/apache/nutch/pull/270
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff of a foreign pull request (one from a
fork) once it is merged, the diff is displayed below for the sake of
provenance:

diff --git a/src/bin/crawl b/src/bin/crawl
index da169353a..c92e5b46f 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -15,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] [-s <Seed Dir>] <Crawl
Dir> <Num Rounds>
+# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] [-s <SeedDir>] [-sm <SitemapDir>]
<CrawlDir> <NumRounds>
 #    -i|--index      Indexes crawl results into a configured indexer
 #    -w|--wait       NUMBER[SUFFIX] Time to wait before generating a new segment when no
URLs
 #                    are scheduled for fetching. Suffix can be: s for second,
@@ -23,8 +23,9 @@
 #                    specified second is used by default.
 #    -D              A Java property to pass to Nutch calls
 #    -s              Path to seeds file(s)
-#    Crawl Dir       Directory where the crawl/link/segments dirs are saved
-#    Num Rounds      The number of rounds to run this crawl for
+#    -sm             Path to sitemap URL file(s)
+#    CrawlDir        Directory where the crawl/link/segments dirs are saved
+#    NumRounds       The number of rounds to run this crawl for
 #
 #
 # UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
@@ -73,6 +74,10 @@ do
             SEEDDIR="${2}"
             shift 2
             ;;
+        -sm)
+            SITEMAPDIR="${2}"
+            shift 2
+            ;;
         -w|--wait)
             WAIT="${2}"
             shift 2
@@ -84,16 +89,17 @@ do
 done
 
 if [[ $# != 2 ]]; then
-    echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] [-s <Seed Dir>]
<Crawl Dir> <Num Rounds>"
+    echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] [-s <SeedDir>] [-sm
<SitemapDir>] <CrawlDir> <NumRounds>"
     echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
     echo -e "\t-D\t\tA Java property to pass to Nutch calls"
     echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when
no URLs"
     echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
     echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
     echo -e "\t\t\tspecified second is used by default."
-    echo -e "\t-s Seed Dir\tPath to seeds file(s)"
-    echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
-    echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
+    echo -e "\t-s SeedDir\tPath to seeds file(s)"
+    echo -e "\t-sm SitemapDir\tPath to sitemap URL file(s)"
+    echo -e "\tCrawlDir\tDirectory where the crawl/host/link/segments dirs are saved"
+    echo -e "\tNumRounds\tThe number of rounds to run this crawl for"
     exit 1
 fi
 
@@ -168,19 +174,23 @@ function __bin_nutch {
 }
 
 # initial injection
-if [[ !  -z  $SEEDDIR  ]]
-then 
+if [[ !  -z  $SEEDDIR  ]]; then
   echo "Injecting seed URLs"
   __bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
 fi
 
+# sitemap processing based on sitemap definition file(s)
+if [[ ! -z $SITEMAPDIR ]]; then
+  echo "Processing sitemaps defined in $SITEMAPDIR"
+  __bin_nutch sitemap "$CRAWL_PATH/crawldb" -sitemapUrls "$SITEMAPDIR" -threads $numThreads
+fi
+
 # main loop : rounds of generate - fetch - parse - update
 for ((a=1; ; a++))
 do
-  if [ -e ".STOP" ]
-  then
-   echo "STOP file found - escaping loop"
-   break
+  if [ -e ".STOP" ]; then
+    echo "STOP file found - escaping loop"
+    break
   fi
 
   if [ $LIMIT -ne -1 ]; then
@@ -193,6 +203,18 @@ do
     echo `date` ": Iteration $a"
   fi
 
+  # create / update HostDB
+  if [[ -d "$CRAWL_PATH"/crawldb ]]; then
+    echo "Updating HostDB"
+    __bin_nutch updatehostdb -crawldb "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb
+  fi
+
+  # sitemap processing based on HostDB
+  if [[ -d "$CRAWL_PATH"/hostdb ]]; then
+    echo "Processing sitemaps based on hosts in HostDB"
+    __bin_nutch sitemap "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb -threads $numThreads
+  fi
+
   echo "Generating a new segment"
   generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist
-numFetchers $numSlaves -noFilter)
   echo "$bin/nutch generate ${generate_args[@]}"


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Integrate sitemap processing and HostDB into crawl script
> ---------------------------------------------------------
>
>                 Key: NUTCH-2491
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2491
>             Project: Nutch
>          Issue Type: Improvement
>            Reporter: Moreno Feltscher
>            Assignee: Moreno Feltscher
>            Priority: Minor
>             Fix For: 1.15
>
>
> Add three new steps to the crawl bash script:
> 1. Generate HostDB from CrawlDB
> 2. Inject URLs from the sitemaps found for hosts in the HostDB
> 3. If given, inject sitemap URLs specified in one or more configuration files



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Mime
View raw message