sudo useradd -d /home/nutch nutch
Log in as nutch.
sudo mkdir /home/search
sudo chown -R nutch:nutch /home/search
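To confirm the ownership change took effect (an optional check):
ls -ld /home/search   # owner and group should both read nutch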
2. Download the latest stable release of Nutch.
Download it from:
http://lucene.apache.org/nutch/release/
For example:
wget http://www.meisei-u.ac.jp/mirror/apache/dist/lucene/nutch/nutch-1.0.tar.gz
3. Unpack the latest stable release of Nutch.
tar xvfz nutch-1.0.tar.gz
4. Edit nutch-site.xml.
mv nutch-1.0 /home/search/nutch
cd /home/search/nutch
vi conf/nutch-site.xml
The contents are as follows:
<configuration>
<property>
<name>http.agent.name</name>
<value>nutch</value>
<description>HTTP 'User-Agent' request header. </description>
</property>
<property>
<name>http.agent.description</name>
<value>Nutch_Test</value>
<description>Further description</description>
</property>
<property>
<name>http.agent.url</name>
<value>localhost</value>
<description>A URL to advertise in the User-Agent header. </description>
</property>
<property>
<name>http.agent.email</name>
<value>test@test.org.tw</value>
<description>An email address
</description>
</property>
<property>
<name>http.robots.agents</name>
<value>nutch</value>
<description>The agent strings we'll look for in robots.txt files,
comma-separated, in decreasing order of precedence. You should
put the value of http.agent.name as the first agent name, and keep the
default * at the end of the list. E.g.: BlurflDev,Blurfl,*
</description>
</property>
<property>
<name>plugin.folders</name>
<value>/home/search/nutch/plugins</value>
<description>Directories where nutch plugins are located. </description>
</property>
<property>
<name>plugin.includes</name>
<value>protocol-(http|httpclient)|urlfilter-regex|parse-(text|html|js|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|swf|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
<description> Regular expression naming plugin directory names</description>
</property>
<property>
<name>parse.plugin.file</name>
<value>parse-plugins.xml</value>
<description>The name of the file that defines the associations between
content-types and parsers.</description>
</property>
<property>
<name>db.max.outlinks.per.page</name>
<value>-1</value>
<description> </description>
</property>
<property>
<name>http.content.limit</name>
<value>-1</value>
</property>
<property>
<name>indexer.mergeFactor</name>
<value>500</value>
<description>The factor that determines the frequency of Lucene segment
merges. This must not be less than 2, higher values increase indexing
speed but lead to increased RAM usage, and increase the number of
open file handles (which may lead to "Too many open files" errors).
NOTE: the "segments" here have nothing to do with Nutch segments, they
are a low-level data unit used by Lucene.
</description>
</property>
<property>
<name>indexer.minMergeDocs</name>
<value>500</value>
<description>This number determines the minimum number of Lucene
Documents buffered in memory between Lucene segment merges. Larger
values increase indexing speed and increase RAM usage.
</description>
</property>
</configuration>
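Before moving on, it can be worth confirming the file is still well-formed XML after editing. A quick optional check, assuming xmllint (from libxml2) is installed:
xmllint --noout conf/nutch-site.xml   # prints nothing if the XML parses cleanly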
5. Edit crawl-urlfilter.txt.
vi conf/crawl-urlfilter.txt
The contents are as follows:
# skip ftp: and mailto: urls
-^(ftp|mailto):
# skip image and other suffixes we can't yet parse
-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$
# skip URLs containing certain characters as probable queries, etc.
-[*!@]
# accept anything else
+.*
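Note that the final +.* rule accepts every URL that survives the filters above, so the crawl can wander off-site. To keep it within a single domain, one common pattern is to replace +.* with a host-specific rule, for example (matching the seed host used below; adapt it to your own site):
# accept urls under waseda.ac.jp only
+^http://([a-z0-9]*\.)*waseda.ac.jp/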
6. Create a directory with a flat file of root urls.
mkdir urls
echo "http://www.waseda.ac.jp" >> urls/urls.txt
7. Edit hadoop-env.sh.
vi conf/hadoop-env.sh
Append the following to hadoop-env.sh:
export JAVA_HOME=/usr/lib/jvm/java-6-sun
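The JAVA_HOME above matches the Sun JDK 6 package location on Ubuntu; if your JDK lives elsewhere, point it there instead. A quick check that the path is real:
/usr/lib/jvm/java-6-sun/bin/java -version   # should report a 1.6.x version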
8. Export the environment variables.
export HADOOP_HOME=/home/search/nutch
export HADOOP_CONF_DIR=/home/search/nutch/conf
export HADOOP_SLAVES=$HADOOP_CONF_DIR/slaves
export HADOOP_LOG_DIR=/tmp/hadoop/logs
export HADOOP_PID_DIR=/tmp/hadoop/pid
export NUTCH_HOME=/home/search/nutch
export NUTCH_CONF_DIR=/home/search/nutch/conf
source conf/hadoop-env.sh
9. Edit hadoop-site.xml.
vi conf/hadoop-site.xml
The contents are as follows:
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://localhost:9000/</value>
<description> </description>
</property>
<property>
<name>mapred.job.tracker</name>
<value>localhost:9001</value>
<description> </description>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/tmp/hadoop/hadoop-${user.name}</value>
<description> </description>
</property>
</configuration>
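The start-all.sh script used below logs into localhost over SSH. If ssh localhost prompts for a password, a passwordless key is usually needed first. A minimal sketch, assuming OpenSSH:
ssh-keygen -t rsa -P "" -f ~/.ssh/id_rsa   # generate a key with an empty passphrase
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
ssh localhost   # should now log in without prompting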
10. Start up Hadoop.
bin/hadoop namenode -format
bin/start-all.sh
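Once start-all.sh returns, the JDK's jps tool should show the five Hadoop daemons (output order may vary):
jps   # expect NameNode, SecondaryNameNode, DataNode, JobTracker, TaskTracker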
11. Test Hadoop.
Open the Hadoop Map/Reduce administration page to confirm the cluster is up.
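With Hadoop's default ports, the web interfaces are usually reachable at:
http://localhost:50030   # Map/Reduce (JobTracker) administration page
http://localhost:50070   # HDFS (NameNode) status page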
12. Add the urls directory to the Hadoop filesystem.
bin/hadoop dfs -put urls urls
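An optional check that the upload landed in HDFS:
bin/hadoop dfs -ls urls   # should list urls.txt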
13. Start the crawl.
bin/nutch crawl urls -dir crawl -depth 3 -topN 100
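Here -depth 3 bounds the crawl at three fetch rounds out from the seeds, and -topN 100 caps each round at the 100 top-scoring URLs. Larger values cover more pages but take correspondingly longer, for example:
bin/nutch crawl urls -dir crawl -depth 10 -topN 1000   # deeper and broader, but much slower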
14. Download Tomcat.
cd ~
wget http://archive.apache.org/dist/tomcat/tomcat-6/v6.0.18/bin/apache-tomcat-6.0.18.tar.gz
15. Unpack Tomcat.
tar xvfz apache-tomcat-6.0.18.tar.gz
16. Edit server.xml.
mv ./apache-tomcat-6.0.18 /home/search/tomcat
cd /home/search/tomcat
vi conf/server.xml
Modify server.xml as follows:
<Connector port="8080" protocol="HTTP/1.1"
connectionTimeout="20000"
redirectPort="8443" URIEncoding="UTF-8"
useBodyEncodingForURI="true" />
17. Change the Nutch search page to Tomcat's root page.
cd /home/search/nutch
mkdir web
cd web
jar -xvf ../nutch-1.0.war
mv /home/search/tomcat/webapps/ROOT /home/search/tomcat/webapps/ROOT.org
mv /home/search/nutch/web /home/search/tomcat/webapps/ROOT
18. Edit nutch-site.xml in Tomcat.
cd /home/search/tomcat
vi webapps/ROOT/WEB-INF/classes/nutch-site.xml
Modify it as follows:
<configuration>
<property>
<name>searcher.dir</name>
<value>/home/search/crawl</value>
</property>
</configuration>
19. Download the crawled result to the local filesystem.
cd /home/search/nutch
bin/hadoop dfs -get crawl /home/search/crawl
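After the copy finishes, the local crawl directory should contain Nutch's standard output subdirectories (a quick sanity check):
ls /home/search/crawl   # expect crawldb, linkdb, segments, indexes, index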
20. Start Tomcat.
/home/search/tomcat/bin/startup.sh
21. Search the crawled index.
http://localhost:8080
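If the page loads but every query returns zero results, searcher.dir from step 18 is the usual suspect; the Tomcat log is the first place to look (the path assumes the layout above):
tail -n 50 /home/search/tomcat/logs/catalina.out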
References:
Nutch installation and usage (Nutch 安裝使用)
http://wiki.apache.org/nutch/NutchHadoopTutorial
http://wiki.apache.org/nutch/NutchTutorial