代码改变世界

Nutch的自动运行

2006-09-14 01:00  cppguy  阅读(2627)  评论(3编辑  收藏  举报

   1: 在Windows下调用nutch的脚本,可实现自动运行,这样做可以不用Cygwin来模拟linux,下面是win xp下调用nutch的脚本

 

nutch.bat
@rem Wrapper: re-runs nutch1.bat (expected in this script's own directory)
@rem under a cmd instance with delayed expansion (/V:on), which nutch1.bat
@rem needs for its !VAR! references. The script path is quoted so that an
@rem install directory containing spaces works; the extra outer pair of
@rem quotes is the one that "cmd /c" strips from the command line.
@cmd /V:on /c ""%~dp0nutch1.bat" %*"

nutch1.bat
@echo on
rem *********************************************************************
rem * A script to launch nutch on Windows 2000/XP System.
rem *
rem * Written by babatu
rem * babatu@gmail.com blog: blog.babatu.com
rem *
rem * Because delayed environment is used, cmd /V:on should be used to
rem * run this script.
rem *
rem * Usage: nutch COMMAND [ARGS...]
rem *   COMMAND is one of the names listed below or a Java class name;
rem *   ARGS are handed to that class unchanged.
rem * Environment:
rem *   JAVA_HOME   required; provides bin\java and lib\tools.jar
rem *   NUTCH_HOME  optional; defaults to the parent of this script's dir
rem *   CLASSPATH   optional; extended, not replaced
rem *********************************************************************
if "%OS%"=="Windows_NT" @setlocal
if "%OS%"=="WINNT" @setlocal

if "%~1"=="" goto :msg
goto :begin
:msg
echo "Usage: nutch COMMAND"
 echo "where COMMAND is one of:"
 echo "  crawl             one-step crawler for intranets"
 echo "  readdb            read / dump crawl db"
 echo "  readlinkdb        read / dump link db"
 echo "  inject            inject new urls into the database"
 echo "  generate          generate new segments to fetch"
 echo "  fetch             fetch a segment's pages"
 echo "  parse             parse a segment's pages"
 echo "  segread           read / dump segment data"
 echo "  updatedb          update crawl db from segments after fetching"
 echo "  invertlinks       create a linkdb from parsed segments"
 echo "  index             run the indexer on parsed segments and linkdb"
 echo "  merge             merge several segment indexes"
 echo "  dedup             remove duplicates from a set of segment indexes"
 echo "  plugin            load a plugin and run one of its classes main()"
 echo "  server            run a search server"
 echo " or"
 echo "  CLASSNAME         run the class named CLASSNAME"
 echo "Most commands print help when invoked w/o parameters."
pause
goto :end

:begin
rem JAVA_HOME locates both the JVM and tools.jar, so stop early without it.
if not defined JAVA_HOME goto :nojava

rem %~dp0 is the expanded pathname of the directory holding this script;
rem the default Nutch home is its parent directory.
set "DEFAULT_NUTCH_HOME=%~dp0.."
if not defined NUTCH_HOME set "NUTCH_HOME=%DEFAULT_NUTCH_HOME%"
rem Clear the helper variable ("set VAR=" removes it; "set VAR=""" would
rem store two literal quote characters instead).
set DEFAULT_NUTCH_HOME=

echo %NUTCH_HOME%

rem Start from any caller-supplied CLASSPATH and add tools.jar. Two plain
rem IFs are used instead of a parenthesised IF/ELSE so that paths containing
rem ")" (e.g. "Program Files (x86)") cannot terminate the block early.
if defined CLASSPATH set "CLASSPATH=%CLASSPATH%;%JAVA_HOME%\lib\tools.jar"
if not defined CLASSPATH set "CLASSPATH=%JAVA_HOME%\lib\tools.jar"
set "CLASSPATH=%CLASSPATH%;%NUTCH_HOME%\conf"
echo %CLASSPATH%
echo before other

rem for developers, add plugins, job & test code to CLASSPATH
if exist "%NUTCH_HOME%\build\plugins" set "CLASSPATH=%CLASSPATH%;%NUTCH_HOME%\build"
rem !CLASSPATH! (delayed expansion) is required inside FOR bodies so that
rem every iteration sees the value produced by the previous one.
if exist "%NUTCH_HOME%\build" for /R "%NUTCH_HOME%\build" %%i in (nutch*.job) do set "CLASSPATH=!CLASSPATH!;%%i"
if exist "%NUTCH_HOME%\build\test\classes" set "CLASSPATH=%CLASSPATH%;%NUTCH_HOME%\build\test\classes"

rem for releases, add Nutch job to CLASSPATH (top level of NUTCH_HOME only;
rem the build tree was already scanned above)
for %%i in ("%NUTCH_HOME%\nutch*.job") do set "CLASSPATH=!CLASSPATH!;%%~fi"
rem add plugins to classpath (the directory that CONTAINS "plugins")
if exist "%NUTCH_HOME%\plugins" set "CLASSPATH=%CLASSPATH%;%NUTCH_HOME%"
rem add libs to CLASSPATH
if exist "%NUTCH_HOME%\lib" for /R "%NUTCH_HOME%\lib" %%f in (*.jar) do set "CLASSPATH=!CLASSPATH!;%%f"

echo %CLASSPATH%

rem translate command; start from an empty CLASS so a value inherited from
rem the caller's environment cannot override the mapping
set CLASS=
if "%~1"=="crawl" set CLASS=org.apache.nutch.crawl.Crawl
if "%~1"=="inject" set CLASS=org.apache.nutch.crawl.Injector
if "%~1"=="generate" set CLASS=org.apache.nutch.crawl.Generator
if "%~1"=="fetch" set CLASS=org.apache.nutch.fetcher.Fetcher
if "%~1"=="parse" set CLASS=org.apache.nutch.parse.ParseSegment
if "%~1"=="readdb" set CLASS=org.apache.nutch.crawl.CrawlDbReader
if "%~1"=="readlinkdb" set CLASS=org.apache.nutch.crawl.LinkDbReader
if "%~1"=="segread" set CLASS=org.apache.nutch.segment.SegmentReader
if "%~1"=="updatedb" set CLASS=org.apache.nutch.crawl.CrawlDb
if "%~1"=="invertlinks" set CLASS=org.apache.nutch.crawl.LinkDb
if "%~1"=="index" set CLASS=org.apache.nutch.indexer.Indexer
if "%~1"=="dedup" set CLASS=org.apache.nutch.indexer.DeleteDuplicates
if "%~1"=="merge" set CLASS=org.apache.nutch.indexer.IndexMerger
if "%~1"=="plugin" set CLASS=org.apache.nutch.plugin.PluginRepository
rem "$" is not special to cmd, so the nested class name needs no quoting
rem (single quotes would be passed to java literally).
if "%~1"=="server" set CLASS=org.apache.nutch.searcher.DistributedSearch$Server
if not defined CLASS set CLASS=%~1

rem %* always contains the command name as well, so remove everything up to
rem and including the first token; only the remaining arguments belong to
rem the Java class.
set NUTCH_ARGS=%*
set NUTCH_ARGS=!NUTCH_ARGS:*%1=!

"%JAVA_HOME%\bin\java" -cp "%CLASSPATH%" %CLASS% %NUTCH_ARGS%

if "%OS%"=="Windows_NT" @endlocal
if "%OS%"=="WINNT" @endlocal
goto :end

:nojava
echo Error: JAVA_HOME is not set.

:end

 

 

 

2:

写一个维护脚本,定时运行

  #!/bin/bash

  # Set JAVA_HOME to reflect your systems java configuration
  export JAVA_HOME=/usr/lib/j2sdk1.5-sun 

  # Start index updation,只查找最热门的前1000条记录,由此创建新的segment
  bin/nutch generate crawl.mydomain/db crawl.mydomain/segments -topN 1000
 #得到最新的segment目录名
  s=`ls -d crawl.virtusa/segments/2* | tail -1`
  echo Segment is $s
  bin/nutch fetch $s
  bin/nutch updatedb crawl.mydomain /db $s
  bin/nutch analyze crawl.mydomain /db 5
  bin/nutch index $s
 #删除重复记录
  bin/nutch dedup crawl.mydomain /segments crawl.mydomain/tmpfile

  # Merge segments to prevent too many open files exception in Lucene
  #合并成一个新的segment
  bin/nutch mergesegs -dir crawl.mydomain/segments -i -ds
  s=`ls -d crawl.mydomain/segments/2* | tail -1`
  echo Merged Segment is $s

  rm -rf crawl.mydomain/index

  以上是在urls文件内容没有变化的时候采用的办法。如果我在urls文件里加入了新的URL,那么在运行generate以前,要先执行下面这条命令:
#bin/nutch inject crawl.mydomain/db -urlfile urls
在generate的时候,如果不加topN参数,那么crawl只会去处理新加的或原来由于其它原因没有fetch的url或page,所以我感觉,脚本1和用2修改的脚本交替运行,会有很好的效果。

摘录自http://www.lvban.com/blog/computer/index.html

 

 

3:使用shell scripts实现nutch的自动运行需要shell环境。这里还有一种可以快速使用的python版nutch脚本

 

import os, sys, glob
# The Nutch command script
#
# Environment Variables
#
#   NUTCH_JAVA_HOME The java implementation to use.  Overrides JAVA_HOME.
#
#   NUTCH_HEAPSIZE  The maximum amount of heap to use, in MB.
#                   Default is 1000.
#
#   NUTCH_OPTS      Extra Java runtime options.
#
#   NUTCH_LOG_DIR   Directory for log files.  Default is NUTCH_HOME/logs.
#
#   NUTCH_LOGFILE   Log file name.  Default is hadoop.log.
#
# ported to python by Ben Ogle (ogle dot ben [at] gmail)
#
# Written to run unchanged on Python 2 and Python 3.

USAGE = """\
Usage: python nutch.py COMMAND
where COMMAND is one of:
  crawl             one-step crawler for intranets
  readdb            read / dump crawl db
  mergedb           merge crawldb-s, with optional filtering
  readlinkdb        read / dump link db
  inject            inject new urls into the database
  generate          generate new segments to fetch
  fetch             fetch a segment's pages
  parse             parse a segment's pages
  segread           read / dump segment data
  mergesegs         merge several segments, with optional filtering and slicing
  updatedb          update crawl db from segments after fetching
  invertlinks       create a linkdb from parsed segments
  mergelinkdb       merge linkdb-s, with optional filtering
  index             run the indexer on parsed segments and linkdb
  merge             merge several segment indexes
  dedup             remove duplicates from a set of segment indexes
  plugin            load a plugin and run one of its classes main()
  server            run a search server
 or
  CLASSNAME         run the class named CLASSNAME
Most commands print help when invoked w/o parameters."""

# Short command name -> fully qualified Java class to run.
COMMAND_CLASSES = {
    "crawl": "org.apache.nutch.crawl.Crawl",
    "inject": "org.apache.nutch.crawl.Injector",
    "generate": "org.apache.nutch.crawl.Generator",
    "fetch": "org.apache.nutch.fetcher.Fetcher",
    "parse": "org.apache.nutch.parse.ParseSegment",
    "readdb": "org.apache.nutch.crawl.CrawlDbReader",
    "mergedb": "org.apache.nutch.crawl.CrawlDbMerger",
    "readlinkdb": "org.apache.nutch.crawl.LinkDbReader",
    "segread": "org.apache.nutch.segment.SegmentReader",
    "mergesegs": "org.apache.nutch.segment.SegmentMerger",
    "updatedb": "org.apache.nutch.crawl.CrawlDb",
    "invertlinks": "org.apache.nutch.crawl.LinkDb",
    "mergelinkdb": "org.apache.nutch.crawl.LinkDbMerger",
    "index": "org.apache.nutch.indexer.Indexer",
    "dedup": "org.apache.nutch.indexer.DeleteDuplicates",
    "merge": "org.apache.nutch.indexer.IndexMerger",
    "plugin": "org.apache.nutch.plugin.PluginRepository",
    # The "$" is part of the nested class's name. It is safe here because the
    # JVM is started without a shell, so nothing tries to expand "$Server".
    "server": "org.apache.nutch.searcher.DistributedSearch$Server",
}


def resolve_class(command):
    """Return the Java class for *command*.

    Known command names are translated through COMMAND_CLASSES; anything
    else is returned unchanged and treated as a class name.
    """
    return COMMAND_CLASSES.get(command, command)


def build_classpath(nutch_home, java_home):
    """Return the list of classpath entries for a Nutch run.

    nutch_home -- root directory of the Nutch installation
    java_home  -- root directory of the JDK (supplies lib/tools.jar)

    Optional entries are added only when they exist on disk. Glob results
    are sorted so the classpath order is the same on every run.
    """
    entries = [nutch_home + "/conf", java_home + "/lib/tools.jar"]

    # for developers, add plugins, job & test code to CLASSPATH
    # The directory that *contains* "plugins" is added, not "plugins" itself,
    # matching the batch launcher (presumably the plugin folder is looked up
    # as a classpath resource -- confirm against plugin.folders handling).
    if os.path.exists(nutch_home + "/build/plugins"):
        entries.append(nutch_home + "/build")
    entries.extend(sorted(glob.glob(nutch_home + "/build/nutch-*.job")))
    if os.path.exists(nutch_home + "/build/test/classes"):
        entries.append(nutch_home + "/build/test/classes")

    # for releases, add Nutch job, plugins and libs to CLASSPATH
    entries.extend(sorted(glob.glob(nutch_home + "/nutch-*.job")))
    if os.path.exists(nutch_home + "/plugins"):
        entries.append(nutch_home)
    entries.extend(sorted(glob.glob(nutch_home + "/lib/*.jar")))
    entries.extend(sorted(glob.glob(nutch_home + "/lib/jetty-ext/*.jar")))
    return entries


def build_command(argv, environ, nutch_home, java_home):
    """Return the JVM invocation as an argument list.

    argv       -- full argument vector; argv[1] is the command or class
                  name and argv[2:] are passed through to the Java class
    environ    -- mapping consulted for NUTCH_HEAPSIZE, NUTCH_OPTS,
                  NUTCH_LOG_DIR and NUTCH_LOGFILE
    nutch_home -- root directory of the Nutch installation
    java_home  -- root directory of the JDK/JRE to run
    """
    java = os.path.join(java_home, "bin", "java")
    if os.name == "nt":
        java += ".exe"

    heap_mb = environ.get("NUTCH_HEAPSIZE") or "1000"
    log_dir = environ.get("NUTCH_LOG_DIR") or nutch_home + "/logs"
    log_file = environ.get("NUTCH_LOGFILE") or "hadoop.log"
    # NUTCH_OPTS may hold several whitespace-separated JVM options.
    extra_opts = (environ.get("NUTCH_OPTS") or "").split()

    classpath = os.pathsep.join(build_classpath(nutch_home, java_home))

    return (
        [java, "-Xmx" + heap_mb + "m"]
        + extra_opts
        + [
            "-Dhadoop.log.dir=" + log_dir,
            "-Dhadoop.log.file=" + log_file,
            "-classpath",
            classpath,
            resolve_class(argv[1]),
        ]
        + list(argv[2:])
    )


def main(argv=None):
    """Run the requested Nutch command and return the process exit status.

    Returns 1 when no command is given or JAVA_HOME cannot be determined;
    otherwise returns the exit status of the Java process.
    """
    # Imported locally so this block does not depend on an edit to the
    # file's import line.
    import subprocess

    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print(USAGE)
        return 1

    java_home = os.environ.get("NUTCH_JAVA_HOME")
    if java_home is not None:
        # os.environ (unlike the non-existent os.setenv) updates both this
        # process's view and the environment inherited by the JVM.
        os.environ["JAVA_HOME"] = java_home
        print(java_home)
    java_home = os.environ.get("JAVA_HOME")
    if java_home is None:
        print("Error: JAVA_HOME is not set.")
        return 1

    # Nutch home is the parent of the directory holding this script.
    # realpath() resolves symlinks, so the result does not depend on the
    # current working directory or on how the script was linked.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    nutch_home = os.path.dirname(script_dir)

    # An argument list (no shell) keeps arguments that contain spaces intact
    # and avoids all quoting problems with the java path and classpath.
    return subprocess.call(build_command(argv, os.environ, nutch_home, java_home))


if __name__ == "__main__":
    sys.exit(main())