nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Apache Wiki <wikidi...@apache.org>
Subject [Nutch Wiki] Update of "Tutorial on incremental crawling" by Gabriele Kahlout
Date Sun, 27 Mar 2011 12:45:34 GMT
Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change notification.

The "Tutorial on incremental crawling" page has been changed by Gabriele Kahlout.
http://wiki.apache.org/nutch/Tutorial%20on%20incremental%20crawling?action=diff&rev1=2&rev2=3

--------------------------------------------------

  
  == 1. Abridged script using Solr ==
  == 2. Unabridged script with explanations and using nutch index ==
+ {{{
- {{{#!/bin/sh
+ #!/bin/sh
+ 
  #
  # Created by Gabriele Kahlout on 27.03.11.
+ # The following script crawls the whole web incrementally: given a list of URLs to
crawl, Nutch will continuously fetch $it_size urls from the specified list, index and
merge them with our whole-web index, so that they can be immediately searched, until all
urls have been fetched.
  #
+ # TO USE:
- # The following script crawls the whole-web incrementally; Specifying a list of urls to
crawl, nutch will continuously fetch $it_size urls from a
- # specified list of urls, index and merge them with our whole-web index,  so that they can
be immediately searched, until all urls have been fetched.
- #
- # Usage: ./whole-web-crawling-incremental [it_seedsDir-path urls-to-fetch-per-iteration
depth]
- #
- # Getting Started:
  # 1. $ mv whole-web-crawling-incremental $NUTCH_HOME/whole-web-crawling-incremental
  # 2. $ cd $NUTCH_HOME
  # 3. $ chmod +x whole-web-crawling-incremental
  # 4. $ ./whole-web-crawling-incremental
- #
+ 
+ # Usage: ./whole-web-crawling-incremental [it_seedsDir-path urls-to-fetch-per-iteration
depth]
  # Start
+ 
  function echoThenRun () { # echo and then run the command
    echo $1
    $1
    echo
  }
+ 
  echoThenRun "rm -r crawl" # fresh crawl
+ 
  if [[ ! -d "build" ]]
  then
-     echoThenRun "ant"
+ 	echoThenRun "ant"
  fi
+ 
  seedsDir="seeds"
  if [[ $1 != "" ]]
  then
-     seedsDir=$1
+ 	seedsDir=$1
  fi
+ 
  it_size=10
  if [[ $2 != "" ]]
  then
-     it_size=$2
+ 	it_size=$2
  fi
+ 
  indexedPlus1=1 #indexedPlus1 urls+1 because of tail. Never printed out
  it_seedsDir="$seedsDir/it_seeds"
  rm -r $it_seedsDir
  mkdir $it_seedsDir
+ 
  allUrls=`cat $seedsDir/*url* | wc -l | sed -e "s/^ *//"`
  echo $allUrls" urls to crawl"
+ 
  it_crawldb="crawl/crawldb"
+ 
  depth=1
  if [[ $3 != "" ]]
  then
-     depth=$3
+ 	depth=$3
  fi
+ 
  while [[ $indexedPlus1 -le $allUrls ]] #repeat generate-fetch-updatedb-invertlinks-index-merge
loop until all urls are fetched
  do
-     rm $it_seedsDir/urls
+ 	rm $it_seedsDir/urls
-     tail -n+$indexedPlus1 $seedsDir/*url* | head -n$it_size > $it_seedsDir/urls
+ 	tail -n+$indexedPlus1 $seedsDir/*url* | head -n$it_size > $it_seedsDir/urls
-     echo
+ 	echo
-     echoThenRun "bin/nutch inject $it_crawldb $it_seedsDir"
+ 	echoThenRun "bin/nutch inject $it_crawldb $it_seedsDir"
-     i=0
+ 	i=0
+ 	
+ 	while [[ $i -lt $depth ]] # depth-first
+ 	do
+ 		echo
+ 		echo "generate-fetch-updatedb-invertlinks-index-merge iteration "$i":"
+ 		
+ 		echo
+ 		cmd="bin/nutch generate $it_crawldb crawl/segments -topN $it_size"
+ 		echo $cmd
+ 		output=`$cmd`
+ 		echo $output
+ 		if [[ $output == *'0 records selected for fetching'* ]] #all the urls of this iteration
have been fetched
+ 		then
+ 			break;
+ 		fi
+ 		s1=`ls -d crawl/segments/2* | tail -1`
  
-     while [[ $i -lt $depth ]] # depth-first
-     do
-         echo
-         echo "generate-fetch-updatedb-invertlinks-index-merge iteration "$i":"
-         echo
- 	cmd="bin/nutch generate $it_crawldb crawl/segments -topN $it_size"
- 	echo $cmd
- 	output=`$cmd`
- 	echo $output
-         if [[ $output == *'0 records selected for fetching'* ]] #all the urls of this iteration
have been fetched
-         then
-             break;
-         fi
-         s1=`ls -d crawl/segments/2* | tail -1`
-         echoThenRun "bin/nutch fetch $s1"
+ 		echoThenRun "bin/nutch fetch $s1"
+ 
-         echoThenRun "bin/nutch updatedb $it_crawldb $s1"
+ 		echoThenRun "bin/nutch updatedb $it_crawldb $s1"
+ 
-         echoThenRun "bin/nutch invertlinks crawl/linkdb -dir crawl/segments"
+ 		echoThenRun "bin/nutch invertlinks crawl/linkdb -dir crawl/segments"
+ 
+ 
-         # echoThenRun "bin/nutch solrindex http://localhost:8080/solr/ $it_crawldb crawl/linkdb
crawl/segments/*"
+ 		# echoThenRun "bin/nutch solrindex http://localhost:8080/solr/ $it_crawldb crawl/linkdb
crawl/segments/*"
-         # if you have solr setup you can use it by uncommenting the above command and commenting
the following nutch index and merge step.
+ 		# if you have solr setup you can use it by uncommenting the above command and commenting
the following nutch index and merge step.
+ 
-         # start nutch index and merge step
+ 		# start nutch index and merge step
-         new_indexes="crawl/new_indexes"
+ 		new_indexes="crawl/new_indexes"
-         rm -r $new_indexes $temp_indexes
+ 		rm -r $new_indexes $temp_indexes
-         echoThenRun "bin/nutch index $new_indexes $it_crawldb crawl/linkdb crawl/segments/*"
+ 		echoThenRun "bin/nutch index $new_indexes $it_crawldb crawl/linkdb crawl/segments/*"
-         indexes="crawl/indexes"
+ 		indexes="crawl/indexes"
-         temp_indexes="crawl/temp_indexes"
+ 		temp_indexes="crawl/temp_indexes"
+ 
-         # solrindex also merged, with nutch index we've to do it:
+ 		# solrindex also merged, with nutch index we've to do it:
-         echoThenRun "bin/nutch merge $temp_indexes/part-1 $indexes $new_indexes" # work-around
for https://issues.apache.org/jira/browse/NUTCH-971 (Patch available)
+ 		echoThenRun "bin/nutch merge $temp_indexes/part-1 $indexes $new_indexes" # work-around
for https://issues.apache.org/jira/browse/NUTCH-971 (Patch available)
+ 
-         rm -r $indexes $new_indexes
+ 		rm -r $indexes $new_indexes
-         mv $temp_indexes $indexes
+ 		mv $temp_indexes $indexes
+ 
-         # end nutch index and merge step
+ 		# end nutch index and merge step
+ 
-         # you can now search the index with http://localhost:8080/solr/admin/ (if setup)
or http://code.google.com/p/luke/ . The index is stored in crawl/indexes, while if Solr is
used then in $NUTCH_HOME/solr/data/index.
+ 		# you can now search the index with http://localhost:8080/solr/admin/ (if setup) or http://code.google.com/p/luke/
. The index is stored in crawl/indexes, while if Solr is used then in $NUTCH_HOME/solr/data/index.
-         ((i++))
+ 		((i++))
-         ((indexedPlus1+=$it_size)) # maybe should readdb crawl/crawldb -stats number of
actually fetched, but (! going to fetch a page) --> infinite loop
+ 		((indexedPlus1+=$it_size)) # maybe should readdb crawl/crawldb -stats number of actually
fetched, but (! going to fetch a page) --> infinite loop
-     done
+ 	done
+ 
-     echoThenRun "bin/nutch readdb $it_crawldb -stats"
+ 	echoThenRun "bin/nutch readdb $it_crawldb -stats"
+ 
-     allcrawldb="crawl/allcrawldb"
+ 	allcrawldb="crawl/allcrawldb"
-     temp_crawldb="crawl/temp_crawldb"
+ 	temp_crawldb="crawl/temp_crawldb"
-     merge_dbs="$it_crawldb $allcrawldb"
+ 	merge_dbs="$it_crawldb $allcrawldb"
+ 
-     # work-around for https://issues.apache.org/jira/browse/NUTCH-972 (Patch available)
+ 	# work-around for https://issues.apache.org/jira/browse/NUTCH-972 (Patch available)
-     if [[ ! -d $allcrawldb ]]
+ 	if [[ ! -d $allcrawldb ]]
-     then
+ 	then
-         merge_dbs="$it_crawldb"
+ 		merge_dbs="$it_crawldb"
-     fi
+ 	fi
+ 
-     echoThenRun "bin/nutch mergedb $temp_crawldb $merge_dbs"
+ 	echoThenRun "bin/nutch mergedb $temp_crawldb $merge_dbs"
+ 
-     rm -r $allcrawldb $it_crawldb crawl/segments crawl/linkdb
+ 	rm -r $allcrawldb $it_crawldb crawl/segments crawl/linkdb
-     mv $temp_crawldb $allcrawldb
+ 	mv $temp_crawldb $allcrawldb
  done
+ 
  echo
  crawl_dump="$allcrawldb/dump"
+ 
  rm -r $crawl_dump $it_seedsDir
  echoThenRun "bin/nutch readdb $allcrawldb -dump $crawl_dump" # you can inspect the dump
with $ vim $crawl_dump
  bin/nutch readdb $allcrawldb -stats

Mime
View raw message