public class CrawlConfig extends Object
| Constructor and Description |
|---|
CrawlConfig() |
| Modifier and Type | Method and Description |
|---|---|
int |
getConnectionTimeout() |
String |
getCrawlStorageFolder() |
int |
getMaxConnectionsPerHost() |
int |
getMaxDepthOfCrawling() |
int |
getMaxDownloadSize() |
int |
getMaxOutgoingLinksToFollow() |
int |
getMaxPagesToFetch() |
int |
getMaxTotalConnections() |
int |
getPolitenessDelay() |
String |
getProxyHost() |
String |
getProxyPassword() |
int |
getProxyPort() |
String |
getProxyUsername() |
int |
getSocketTimeout() |
String |
getUserAgentString() |
boolean |
isFollowRedirects() |
boolean |
isIncludeBinaryContentInCrawling() |
boolean |
isIncludeHttpsPages() |
boolean |
isResumableCrawling() |
void |
setConnectionTimeout(int connectionTimeout)
Connection timeout in milliseconds
|
void |
setCrawlStorageFolder(String crawlStorageFolder)
The folder which will be used by the crawler for storing the intermediate
crawl data.
|
void |
setFollowRedirects(boolean followRedirects)
Should we follow redirects?
|
void |
setIncludeBinaryContentInCrawling(boolean includeBinaryContentInCrawling)
Should we fetch binary content such as images, audio, ...?
|
void |
setIncludeHttpsPages(boolean includeHttpsPages)
Should we also crawl https pages?
|
void |
setMaxConnectionsPerHost(int maxConnectionsPerHost)
Maximum connections per host
|
void |
setMaxDepthOfCrawling(int maxDepthOfCrawling)
Maximum depth of crawling. For unlimited depth, this parameter should be
set to -1.
|
void |
setMaxDownloadSize(int maxDownloadSize)
Max allowed size of a page.
|
void |
setMaxOutgoingLinksToFollow(int maxOutgoingLinksToFollow)
Max number of outgoing links which are processed from a page
|
void |
setMaxPagesToFetch(int maxPagesToFetch)
Maximum number of pages to fetch. For an unlimited number of pages, this
parameter should be set to -1.
|
void |
setMaxTotalConnections(int maxTotalConnections)
Maximum total connections
|
void |
setPolitenessDelay(int politenessDelay)
Politeness delay in milliseconds (delay between sending two requests to
the same host).
|
void |
setProxyHost(String proxyHost)
If the crawler should run behind a proxy, this parameter can be used for
specifying the proxy host.
|
void |
setProxyPassword(String proxyPassword)
If the crawler should run behind a proxy and a username/password is needed
for authentication with the proxy, this parameter can be used for specifying
the password.
|
void |
setProxyPort(int proxyPort)
If the crawler should run behind a proxy, this parameter can be used for
specifying the proxy port.
|
void |
setProxyUsername(String proxyUsername)
If the crawler should run behind a proxy and a username/password is needed
for authentication with the proxy, this parameter can be used for specifying
the username.
|
void |
setResumableCrawling(boolean resumableCrawling)
If this feature is enabled, you will be able to resume a previously
stopped/crashed crawl.
|
void |
setSocketTimeout(int socketTimeout)
Socket timeout in milliseconds
|
void |
setUserAgentString(String userAgentString)
User-agent string that is used for representing your crawler to web
servers.
|
String |
toString() |
void |
validate()
Validates the configs specified by this instance.
|
public void validate()
throws Exception
Throws: Exception
public String getCrawlStorageFolder()
public void setCrawlStorageFolder(String crawlStorageFolder)
public boolean isResumableCrawling()
public void setResumableCrawling(boolean resumableCrawling)
public int getMaxDepthOfCrawling()
public void setMaxDepthOfCrawling(int maxDepthOfCrawling)
public int getMaxPagesToFetch()
public void setMaxPagesToFetch(int maxPagesToFetch)
public String getUserAgentString()
public void setUserAgentString(String userAgentString)
public int getPolitenessDelay()
public void setPolitenessDelay(int politenessDelay)
Parameters: politenessDelay - the delay in milliseconds.
public boolean isIncludeHttpsPages()
public void setIncludeHttpsPages(boolean includeHttpsPages)
public boolean isIncludeBinaryContentInCrawling()
public void setIncludeBinaryContentInCrawling(boolean includeBinaryContentInCrawling)
public int getMaxConnectionsPerHost()
public void setMaxConnectionsPerHost(int maxConnectionsPerHost)
public int getMaxTotalConnections()
public void setMaxTotalConnections(int maxTotalConnections)
public int getSocketTimeout()
public void setSocketTimeout(int socketTimeout)
public int getConnectionTimeout()
public void setConnectionTimeout(int connectionTimeout)
public int getMaxOutgoingLinksToFollow()
public void setMaxOutgoingLinksToFollow(int maxOutgoingLinksToFollow)
public int getMaxDownloadSize()
public void setMaxDownloadSize(int maxDownloadSize)
public boolean isFollowRedirects()
public void setFollowRedirects(boolean followRedirects)
public String getProxyHost()
public void setProxyHost(String proxyHost)
public int getProxyPort()
public void setProxyPort(int proxyPort)
public String getProxyUsername()
public void setProxyUsername(String proxyUsername)
public String getProxyPassword()
public void setProxyPassword(String proxyPassword)
Copyright © 2013. All Rights Reserved.