nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "behnam nikbakht (Created) (JIRA)" <j...@apache.org>
Subject [jira] [Created] (NUTCH-1199) unfetched URLs problem
Date Tue, 08 Nov 2011 06:30:53 GMT
unfetched URLs problem
----------------------

                 Key: NUTCH-1199
                 URL: https://issues.apache.org/jira/browse/NUTCH-1199
             Project: Nutch
          Issue Type: Improvement
          Components: fetcher, generator
            Reporter: behnam nikbakht
            Priority: Critical


We wrote a script to fetch the unfetched URLs:
#first dump the crawldb with readdb to a csv file, then extract the unfetched urls to a text file:
        bin/nutch readdb "$crawldb" -dump "$SITE_DIR/tmp/dump_urls.txt" -format csv
        # keep only the dump lines that mention db_unfetched
        # NOTE(review): only part-00000 is read; confirm the dump never produces more parts
        grep db_unfetched "$SITE_DIR/tmp/dump_urls.txt/part-00000" > "$SITE_DIR/tmp/dump_unf"
        unfetched_urls_file="$SITE_DIR/tmp/unfetched_urls/unfetched_urls.txt"
        # the redirection below fails when the target directory does not exist yet
        mkdir -p "${unfetched_urls_file%/*}"
        # print the text between the first pair of double quotes of every line
        # (presumably the url column of the csv dump)
        awk -F '"' '{print $2}' "$SITE_DIR/tmp/dump_unf" > "$unfetched_urls_file"

        # number of unfetched urls, one url per line
        unfetched_count=$(wc -l < "$unfetched_urls_file")
#next, we have a list of unfetched urls in unfetched_urls.txt; we then use the freegen command
#to create segments for these urls. We cannot use the generate command because these urls were
#already generated previously
       # Fewer unfetched urls than one batch holds: put them all in a single
       # segment, fetch/parse/update it, collect new links and stop the script.
       if [[ "$unfetched_count" -lt "$it_size" ]]; then
                        echo "UNFETCHED $J , $it_size URLs from $unfetched_count generated"
                        J=$((J + 1))
                        # freegen needs the url file AND the segments directory on the
                        # same command line; stop here if it fails so that an older
                        # segment is not fetched again by mistake
                        if ! bin/nutch freegen "$SITE_DIR/tmp/unfetched_urls/unfetched_urls.txt" "$crawlseg"; then
                                echo "freegen failed for $SITE_DIR/tmp/unfetched_urls/unfetched_urls.txt" >&2
                                exit 1
                        fi
                        # newest segment = last entry of the sorted glob (no ls parsing)
                        segments=("$crawlseg"/2*)
                        s2=${segments[${#segments[@]}-1]}
                        if [[ ! -d "$s2" ]]; then
                                echo "no segment found under $crawlseg" >&2
                                exit 1
                        fi
                        bin/nutch fetch "$s2"
                        bin/nutch parse "$s2"
                        bin/nutch updatedb "$crawldb" "$s2"
                        # keep a trace of every updatedb that was run
                        echo "bin/nutch updatedb $crawldb $s2" >> "$SITE_DIR/updatedblog.txt"
                        get_new_links
                        exit
       fi
# if the number of urls is at least it_size, package them into batches of
# it_size urls and run freegen/fetch/parse/updatedb once per batch

        # Creates a segment from one batch file, then fetches, parses and merges it.
        # Arguments: $1 - file holding the urls of the batch, one per line
        # Globals:   J (incremented), s2 (set; deliberately not local because the
        #            original code assigned it at script scope), it_size,
        #            unfetched_count, crawlseg, crawldb, SITE_DIR (read)
        # Returns:   1 when freegen fails or no segment directory can be found
        fetch_unfetched_batch() {
                local batch_file=$1
                local -a segments

                echo "UNFETCHED $J , $it_size URLs from $unfetched_count generated"
                J=$((J + 1))
                # freegen needs the url file AND the segments directory on the same
                # command line; skip the batch if it fails so that an older segment
                # is not fetched again by mistake
                if ! bin/nutch freegen "$batch_file" "$crawlseg"; then
                        echo "freegen failed for $batch_file" >&2
                        return 1
                fi
                #finally fetch,parse and update new segment
                # newest segment = last entry of the sorted glob (no ls parsing)
                segments=("$crawlseg"/2*)
                s2=${segments[${#segments[@]}-1]}
                if [[ ! -d "$s2" ]]; then
                        echo "no segment found under $crawlseg" >&2
                        return 1
                fi
                bin/nutch fetch "$s2"
                bin/nutch parse "$s2"
                # NOTE(review): the lock file is removed unconditionally before
                # updatedb, as the original did; confirm no other job can hold it
                rm -f -- "$crawldb/.locked"
                bin/nutch updatedb "$crawldb" "$s2"
                # keep a trace of every updatedb that was run
                echo "bin/nutch updatedb $crawldb $s2" >> "$SITE_DIR/updatedblog.txt"
        }

        # Splits $unfetched_urls_file into batch files of it_size urls each
        # (unfetched_urls<N>/unfetched_urls<N>.txt, N counted from 0) and processes
        # every batch, including a final batch that holds fewer than it_size urls.
        # Globals:   unfetched_urls_file, it_size, SITE_DIR (read)
        # Outputs:   the batch index of every url on stdout
        # Returns:   1 when it_size is not a positive integer
        fetch_unfetched_in_batches() {
                local url batch_dir batch_file
                local count=0 ind=0

                # guards the divisions below against an empty or zero batch size
                if [[ ! "$it_size" =~ ^[1-9][0-9]*$ ]]; then
                        echo "it_size must be a positive integer, got '$it_size'" >&2
                        return 1
                fi

                # NOTE(review): urls are appended, so batch files left over from an
                # earlier run would be extended; confirm the directory is cleaned
                while IFS= read -r url || [[ -n "$url" ]]; do
                        # count is 0-based, so urls 0..it_size-1 land in batch 0
                        ind=$(( count / it_size ))
                        batch_dir="$SITE_DIR/tmp/unfetched_urls/unfetched_urls$ind"
                        batch_file="$batch_dir/unfetched_urls$ind.txt"
                        mkdir -p "$batch_dir"
                        printf '%s\n' "$url" >> "$batch_file"
                        echo "$ind"
                        count=$(( count + 1 ))
                        # the batch is full: process it before reading further urls
                        if (( count % it_size == 0 )); then
                                fetch_unfetched_batch "$batch_file"
                        fi
                done < "$unfetched_urls_file"

                # urls left over after the last full batch still have to be fetched
                if (( count % it_size != 0 )); then
                        fetch_unfetched_batch "$batch_file"
                fi
        }

        fetch_unfetched_in_batches


--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators: https://issues.apache.org/jira/secure/ContactAdministrators!default.jspa
For more information on JIRA, see: http://www.atlassian.com/software/jira

        

Mime
View raw message