2006-09-14 01:00 cppguy 阅读(2627) 评论(3) 编辑 收藏 举报 1: 在 Windows 下调用 nutch 的脚本,可实现自动运行,这样做可以不用 Cygwin 来模拟 Linux。下面是 Win XP 下调用 nutch 的脚本:
@rem Wrapper script: starts a new cmd instance with delayed variable expansion
@rem switched on (/V:on) and runs nutch1.bat from the directory of this
@rem wrapper, forwarding every command-line argument unchanged.
@cmd /V:on /c %~dp0nutch1.bat %*
@echo on
rem *********************************************************************
rem * A script to launch nutch on Windows 2000/XP System.
rem *
rem * Written by babatu
rem * blog:
rem *
rem * Usage: nutch1 COMMAND [ARGS...]
rem *
rem * Environment:
rem *   JAVA_HOME   required, must point at a JDK.
rem *   NUTCH_HOME  optional, defaults to the parent directory of this script.
rem *   CLASSPATH   optional, the Nutch entries are added to it.
rem *
rem * Delayed variable expansion is needed by the CLASSPATH loops below.
rem * It is enabled through setlocal, so running the script via cmd /V:on
rem * still works but is no longer required.
rem *****************************************************************
if "%OS%"=="Windows_NT" @setlocal EnableDelayedExpansion
if "%OS%"=="WINNT" @setlocal EnableDelayedExpansion
if "%~1" == "" goto :msg
goto :begin

:msg
echo "Usage: nutch COMMAND"
echo "where COMMAND is one of:"
echo " crawl one-step crawler for intranets"
echo " readdb read / dump crawl db"
echo " readlinkdb read / dump link db"
echo " inject inject new urls into the database"
echo " generate generate new segments to fetch"
echo " fetch fetch a segment's pages"
echo " parse parse a segment's pages"
echo " segread read / dump segment data"
echo " updatedb update crawl db from segments after fetching"
echo " invertlinks create a linkdb from parsed segments"
echo " index run the indexer on parsed segments and linkdb"
echo " merge merge several segment indexes"
echo " dedup remove duplicates from a set of segment indexes"
echo " plugin load a plugin and run one of its classes main()"
echo " server run a search server"
echo " or"
echo " CLASSNAME run the class named CLASSNAME"
echo "Most commands print help when invoked w/o parameters."
goto :end

:begin
if not defined JAVA_HOME echo Error: JAVA_HOME is not set.
if not defined JAVA_HOME goto :end
rem %~dp0 is the expanded pathname of the current script under NT;
rem the loop variable is only used to normalise the path of its parent.
if not defined NUTCH_HOME for %%d in ("%~dp0..") do set "NUTCH_HOME=%%~fd"
rem set _USE_CLASSPATH=yes
rem keep any CLASSPATH supplied by the caller and append the JDK tools.jar
if defined CLASSPATH set "CLASSPATH=%CLASSPATH%;%JAVA_HOME%\lib\tools.jar"
if not defined CLASSPATH set "CLASSPATH=%JAVA_HOME%\lib\tools.jar"
rem the conf directory goes first, as in the stock bin/nutch script
set "CLASSPATH=%NUTCH_HOME%\conf;%CLASSPATH%"
echo before other
rem for developers, add plugins, job and test code to CLASSPATH
if exist "%NUTCH_HOME%\build\plugins" set "CLASSPATH=%CLASSPATH%;%NUTCH_HOME%\build"
if exist "%NUTCH_HOME%\build" for /R "%NUTCH_HOME%\build" %%i in (nutch*.job) do set "CLASSPATH=!CLASSPATH!;%%i"
if exist "%NUTCH_HOME%\build\test\classes" set "CLASSPATH=%CLASSPATH%;%NUTCH_HOME%\build\test\classes"
rem for releases, add Nutch job to CLASSPATH
for /R "%NUTCH_HOME%" %%i in (nutch*.job) do set "CLASSPATH=!CLASSPATH!;%%i"
rem add plugins to classpath
if exist "%NUTCH_HOME%\plugins" set "CLASSPATH=%NUTCH_HOME%;%CLASSPATH%"
rem add libs to CLASSPATH
if exist "%NUTCH_HOME%\lib" for /R "%NUTCH_HOME%\lib" %%f in (*.jar) do set "CLASSPATH=!CLASSPATH!;%%f"
rem translate command
rem clear CLASS first so a value inherited from the environment is ignored
set CLASS=
if "%~1"=="crawl" set CLASS=org.apache.nutch.crawl.Crawl
if "%~1"=="inject" set CLASS=org.apache.nutch.crawl.Injector
if "%~1"=="generate" set CLASS=org.apache.nutch.crawl.Generator
if "%~1"=="fetch" set CLASS=org.apache.nutch.fetcher.Fetcher
if "%~1"=="parse" set CLASS=org.apache.nutch.parse.ParseSegment
if "%~1"=="readdb" set CLASS=org.apache.nutch.crawl.CrawlDbReader
if "%~1"=="readlinkdb" set CLASS=org.apache.nutch.crawl.LinkDbReader
if "%~1"=="segread" set CLASS=org.apache.nutch.segment.SegmentReader
if "%~1"=="updatedb" set CLASS=org.apache.nutch.crawl.CrawlDb
if "%~1"=="invertlinks" set CLASS=org.apache.nutch.crawl.LinkDb
if "%~1"=="index" set CLASS=org.apache.nutch.indexer.Indexer
if "%~1"=="dedup" set CLASS=org.apache.nutch.indexer.DeleteDuplicates
if "%~1"=="merge" set CLASS=org.apache.nutch.indexer.IndexMerger
if "%~1"=="plugin" set CLASS=org.apache.nutch.plugin.PluginRepository
if "%~1"=="server" set CLASS=org.apache.nutch.searcher.DistributedSearch$Server
rem anything else is taken to be a fully qualified class name
if not defined CLASS set CLASS=%1
rem The Java class must not receive the command name itself, so everything
rem up to and including the first occurrence of the command is removed from
rem the raw argument string; the remaining arguments keep their quoting.
set NUTCH_ARGS=%*
set "NUTCH_ARGS=!NUTCH_ARGS:*%1=!"
"%JAVA_HOME%\bin\java" -cp "%CLASSPATH%" %CLASS% %NUTCH_ARGS%

:end
if "%OS%"=="Windows_NT" @endlocal
if "%OS%"=="WINNT" @endlocal
# Recrawl script: generates a new segment, fetches it, updates and analyzes
# the web db, indexes the segment, removes duplicates and finally merges all
# segments. It is meant to be run from the Nutch installation directory,
# because every command is invoked through the relative path bin/nutch.

# Set JAVA_HOME to reflect your systems java configuration
export JAVA_HOME=/usr/lib/j2sdk1.5-sun

# Start the index update: only the 1000 top-scoring records are selected,
# and a new segment is created from them.
bin/nutch generate crawl.mydomain/db crawl.mydomain/segments -topN 1000

# The segment directories matched here start with "2"; the last one listed
# is taken to be the segment that was just generated.
s=$(ls -d crawl.mydomain/segments/2* | tail -1)
echo "Segment is $s"
bin/nutch fetch "$s"
bin/nutch updatedb crawl.mydomain/db "$s"
bin/nutch analyze crawl.mydomain/db 5
bin/nutch index "$s"
bin/nutch dedup crawl.mydomain/segments crawl.mydomain/tmpfile

# Merge segments to prevent too many open files exception in Lucene
bin/nutch mergesegs -dir crawl.mydomain/segments -i -ds
s=$(ls -d crawl.mydomain/segments/2* | tail -1)
echo "Merged Segment is $s"

rm -rf crawl.mydomain/index
#bin/nutch inject crawl.mydomain/db -urlfile urls
3: 使用 shell 脚本实现 nutch 的自动运行需要有 shell 环境。这里还有一种使用 Python 快速实现的方法:
import glob
import os
import shlex
import subprocess
import sys

# The Nutch command script
#
# Environment Variables
#
#   NUTCH_JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
#
#   NUTCH_HEAPSIZE  The maximum amount of heap to use, in MB.
#                   Default is 1000.
#
#   NUTCH_OPTS      Extra Java runtime options.
#
#   NUTCH_LOG_DIR   Log directory. Default is <nutch home>/logs.
#
#   NUTCH_LOGFILE   Log file name. Default is hadoop.log.
#
# ported to python by Ben Ogle (ogle dot ben [at] gmail)
#
# The Nutch home is taken to be the parent of the current working directory,
# so the script has to be started from the bin directory; links are not
# resolved.

USAGE = "\n".join((
    "Usage: python COMMAND",
    "where COMMAND is one of:",
    " crawl one-step crawler for intranets",
    " readdb read / dump crawl db",
    " mergedb merge crawldb-s, with optional filtering",
    " readlinkdb read / dump link db",
    " inject inject new urls into the database",
    " generate generate new segments to fetch",
    " fetch fetch a segment's pages",
    " parse parse a segment's pages",
    " segread read / dump segment data",
    " mergesegs merge several segments, with optional filtering and slicing",
    " updatedb update crawl db from segments after fetching",
    " invertlinks create a linkdb from parsed segments",
    " mergelinkdb merge linkdb-s, with optional filtering",
    " index run the indexer on parsed segments and linkdb",
    " merge merge several segment indexes",
    " dedup remove duplicates from a set of segment indexes",
    " plugin load a plugin and run one of its classes main()",
    " server run a search server",
    " or",
    " CLASSNAME run the class named CLASSNAME",
    "Most commands print help when invoked w/o parameters.",
))

# Short command name -> Java class that implements it.
COMMAND_CLASSES = {
    "crawl": "org.apache.nutch.crawl.Crawl",
    "inject": "org.apache.nutch.crawl.Injector",
    "generate": "org.apache.nutch.crawl.Generator",
    "fetch": "org.apache.nutch.fetcher.Fetcher",
    "parse": "org.apache.nutch.parse.ParseSegment",
    "readdb": "org.apache.nutch.crawl.CrawlDbReader",
    "mergedb": "org.apache.nutch.crawl.CrawlDbMerger",
    "readlinkdb": "org.apache.nutch.crawl.LinkDbReader",
    "segread": "org.apache.nutch.segment.SegmentReader",
    "mergesegs": "org.apache.nutch.segment.SegmentMerger",
    "updatedb": "org.apache.nutch.crawl.CrawlDb",
    "invertlinks": "org.apache.nutch.crawl.LinkDb",
    "mergelinkdb": "org.apache.nutch.crawl.LinkDbMerger",
    "index": "org.apache.nutch.indexer.Indexer",
    "dedup": "org.apache.nutch.indexer.DeleteDuplicates",
    "merge": "org.apache.nutch.indexer.IndexMerger",
    "plugin": "org.apache.nutch.plugin.PluginRepository",
    "server": "org.apache.nutch.searcher.DistributedSearch$Server",
}


def resolve_class(command):
    """Return the Java class for *command*.

    A command that is not one of the known short names is returned
    unchanged, i.e. it is treated as a fully qualified class name.
    """
    return COMMAND_CLASSES.get(command, command)


def build_classpath(nutch_home, java_home, sep=os.pathsep):
    """Return the Java classpath string for a Nutch installation.

    Entries are added in this order: the conf directory, the JDK tools.jar,
    the developer build outputs (plugins, job files, test classes), the
    release job files, the plugins directory, and the jars under lib and
    lib/jetty-ext. Optional directories are added only when they exist.
    Glob results are sorted so the classpath does not depend on the order
    in which the file system lists the files.
    """
    def under_home(*parts):
        return os.path.join(nutch_home, *parts)

    entries = [
        under_home("conf"),
        # tools.jar ships with the JDK, not with Nutch.
        os.path.join(java_home, "lib", "tools.jar"),
    ]

    # for developers, add plugins, job & test code to CLASSPATH
    if os.path.exists(under_home("build", "plugins")):
        entries.append(under_home("build", "plugins"))
    entries.extend(sorted(glob.glob(under_home("build", "nutch-*.job"))))
    if os.path.exists(under_home("build", "test", "classes")):
        entries.append(under_home("build", "test", "classes"))

    # for releases, add the Nutch job, the plugins and the libs
    entries.extend(sorted(glob.glob(under_home("nutch-*.job"))))
    if os.path.exists(under_home("plugins")):
        entries.append(under_home("plugins"))
    entries.extend(sorted(glob.glob(under_home("lib", "*.jar"))))
    entries.extend(sorted(glob.glob(under_home("lib", "jetty-ext", "*.jar"))))
    return sep.join(entries)


def _split_opts(opts):
    """Split the NUTCH_OPTS string into separate JVM arguments."""
    if os.name == "nt":
        # POSIX shlex rules would treat the backslashes of Windows paths
        # as escape characters, so only split on whitespace there.
        return opts.split()
    return shlex.split(opts)


def _java_executable(java_home):
    """Return the java binary under *java_home*, or "java" if it is absent.

    Falling back to plain "java" keeps the earlier behaviour of running
    whatever java is found on PATH.
    """
    name = "java.exe" if os.name == "nt" else "java"
    candidate = os.path.join(java_home, "bin", name)
    return candidate if os.path.isfile(candidate) else "java"


def build_command(argv, env, nutch_home):
    """Return the java command line as a list of arguments.

    argv       -- full argument vector; argv[1] is the Nutch command and
                  argv[2:] are passed through to the Java class.
    env        -- mapping of environment variables; JAVA_HOME is required.
    nutch_home -- root directory of the Nutch installation.

    Raises KeyError when JAVA_HOME is missing from *env*.
    """
    java_home = env["JAVA_HOME"]
    heap_max = "-Xmx" + env.get("NUTCH_HEAPSIZE", "1000") + "m"
    log_dir = env.get("NUTCH_LOG_DIR", os.path.join(nutch_home, "logs"))
    log_file = env.get("NUTCH_LOGFILE", "hadoop.log")

    command = [_java_executable(java_home), heap_max]
    command.extend(_split_opts(env.get("NUTCH_OPTS", "")))
    command.extend([
        "-Dhadoop.log.dir=" + log_dir,
        "-Dhadoop.log.file=" + log_file,
        "-classpath",
        build_classpath(nutch_home, java_home),
        resolve_class(argv[1]),
    ])
    command.extend(argv[2:])
    return command


def main(argv=None):
    """Run the Nutch command named in *argv* and return the exit status.

    Prints the usage text and returns 1 when no command is given, and
    prints an error and returns 1 when JAVA_HOME is not set or java cannot
    be started. Otherwise the exit status of the java process is returned.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print(USAGE)
        return 1

    override = os.environ.get("NUTCH_JAVA_HOME")
    if override is not None:
        # os.environ assignment also updates the environment of the child.
        os.environ["JAVA_HOME"] = override
        print(override)
    if os.environ.get("JAVA_HOME") is None:
        print("Error: JAVA_HOME is not set.")
        return 1

    nutch_home = os.path.join(os.getcwd(), os.pardir)
    command = build_command(argv, os.environ, nutch_home)
    try:
        # An argument list is run without a shell, so "$Server" is not
        # expanded and arguments containing spaces stay intact.
        return subprocess.call(command)
    except OSError as err:
        print("Error: cannot run %s: %s" % (command[0], err))
        return 1


if __name__ == "__main__":
    sys.exit(main())