Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ lib/spotbugs-*
ivy/dependency-check-ant/*
.gradle*
ivy/apache-rat-*
.vscode
3 changes: 2 additions & 1 deletion conf/log4j2.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
<Appenders>
<RollingFile name="RollingFile" fileName="${hadoop.log.dir}/${hadoop.log.file}"
filePattern="${hadoop.log.dir}/$${date:yyyy-MM}/nutch-%d{yyyy-MM-dd}.log.gz">
<PatternLayout pattern="%d %p %c{1.} [%t] %m%n" />
<!--<PatternLayout pattern="%d %p %c{1.} [%t] %m%n" />-->
<PatternLayout pattern="%d %p %c [%t] %m%n" />
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the reason for this change? Does this print the logger name in full?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it prints the logger name in full; otherwise I cannot always tell where a message is coming from.
We can revert the change before merging - for the time being I need it on my side.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@HiranChaudhuri please revert. Thank you

<CronTriggeringPolicy schedule="0 0 0 * * ?" evaluateOnStartup="true" />
<DefaultRolloverStrategy>
<Delete basePath="${hadoop.log.dir}" maxDepth="2">
Expand Down
22 changes: 22 additions & 0 deletions conf/url-authentication.xml.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
Comment thread
lewismc marked this conversation as resolved.
<!-- Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!-- TODO: credentials are kept in plain text in this file; it should be replaced by some encrypted vault. -->
<!-- Each <authentication> entry pairs a URL pattern with the user, domain and password to use for it.
     NOTE(review): the pattern looks like a regular expression applied to the URL; confirm against the code that reads this file.
     NOTE(review): the entries below appear to be sample values; the second one looks specific to a developer setup and
     should probably be generalized before release. -->
<url-authentication>
<authentication pattern="^smb://host/share.*" user="user" domain="domain" password="password"/>
<authentication pattern="^smb://hiran@nas/Documents.*" user="hiran" domain="domain" password="password"/>
</url-authentication>
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/crawl/Injector.java
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ private String filterNormalize(String url) {
if (filters != null)
url = filters.filter(url); // filter the url
} catch (Exception e) {
LOG.warn("Skipping " + url + ":" + e);
LOG.warn("Skipping {}", url, e);
url = null;
}
}
Expand Down
3 changes: 3 additions & 0 deletions src/plugin/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
<ant dir="protocol-httpclient" target="deploy"/>
<ant dir="protocol-interactiveselenium" target="deploy" />
<ant dir="protocol-okhttp" target="deploy"/>
<ant dir="protocol-smb" target="deploy"/>
Comment thread
lewismc marked this conversation as resolved.
<ant dir="protocol-selenium" target="deploy" />
<ant dir="publish-rabbitmq" target="deploy"/>
<ant dir="scoring-depth" target="deploy"/>
Expand Down Expand Up @@ -142,6 +143,7 @@
<ant dir="protocol-http" target="test"/>
<ant dir="protocol-httpclient" target="test"/>
<ant dir="protocol-okhttp" target="test"/>
<ant dir="protocol-smb" target="test"/>
<ant dir="scoring-orphan" target="test"/>
<ant dir="scoring-metadata" target="test"/>
<ant dir="subcollection" target="test"/>
Expand Down Expand Up @@ -226,6 +228,7 @@
<ant dir="protocol-httpclient" target="clean"/>
<ant dir="protocol-interactiveselenium" target="clean" />
<ant dir="protocol-okhttp" target="clean"/>
<ant dir="protocol-smb" target="clean"/>
<ant dir="protocol-selenium" target="clean" />
<ant dir="publish-rabbitmq" target="clean"/>
<ant dir="scoring-depth" target="clean"/>
Expand Down
22 changes: 22 additions & 0 deletions src/plugin/protocol-smb/build.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Ant build file for the protocol-smb plugin. It defines no targets of its own;
     the default target "jar-core" is presumably provided by the imported build-plugin.xml. -->
<project name="protocol-smb" default="jar-core">

<!-- Pull in the shared plugin build definitions from the parent directory. -->
<import file="../build-plugin.xml"/>

</project>
54 changes: 54 additions & 0 deletions src/plugin/protocol-smb/ivy.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<ivy-module xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="http://ant.apache.org/ivy/schemas/ivy.xsd"
xmlns:ns0="http://ant.apache.org/ivy/maven" version="2.0">
<info organisation="org.apache.nutch" module="${ant.project.name}">
<license name="Apache 2.0"/>
<ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
<description>
Apache Nutch
</description>
</info>

<configurations>
<include file="../../..//ivy/ivy-configurations.xml"/>
</configurations>

<publications>
<!--get the artifact from our module name-->
<artifact conf="master"/>
</publications>

<dependencies>
<dependency org="com.hierynomus" name="smbj" rev="0.13.0"/>
<!--
These dependencies are either contained in smbj (transitive) or
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can just remove these comments.

already provided by Nutch itself. Hence they can remain commented out.

<dependency org="net.engio" name="mbassador" rev="1.3.0"/>
Comment thread
lewismc marked this conversation as resolved.
<dependency org="org.bouncycastle" name="bcprov-jdk18on" rev="1.75"/>
<dependency org="com.hierynomus" name="asn-one" rev="0.6.0"/>
<dependency org="commons-io" name="commons-io" rev="2.17.0"/>
Comment thread
HiranChaudhuri marked this conversation as resolved.
-->
<dependency org="com.google.guava" name="guava" rev="33.3.1-jre"/>

</dependencies>

</ivy-module>
53 changes: 53 additions & 0 deletions src/plugin/protocol-smb/plugin.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<plugin
id="protocol-smb"
name="SMB Protocol based on https://github.com/hierynomus/smbj"
version="1.0.0"
provider-name="Hiran Chaudhuri">

<runtime>
<library name="asn-one-0.6.0.jar"/>
Comment thread
lewismc marked this conversation as resolved.
<library name="bcprov-jdk18on-1.75.jar"/>
<library name="mbassador-1.3.0.jar"/>
<library name="protocol-smb.jar">
<export name="*"/>
</library>
<library name="smbj-0.13.0.jar"/>

<library name="commons-io-2.17.0.jar"/>
</runtime>

<!-- This plugin depends on the nutch-extensionpoints plugin. -->
<requires>
<import plugin="nutch-extensionpoints"/>
</requires>

<!-- Registers the SmbProtocol class at the org.apache.nutch.protocol.Protocol extension point. -->
<extension id="org.apache.nutch.protocol.smb"
name="SmbProtocol"
point="org.apache.nutch.protocol.Protocol">

<implementation id="org.apache.nutch.protocol.smb.Smb"
class="org.apache.nutch.protocol.smb.SmbProtocol">
<!-- NOTE(review): presumably the URL scheme this implementation is selected for; confirm against the plugin loader. -->
<parameter name="protocolName" value="smb"/>
<!-- Names the URLStreamHandler class that accompanies this protocol implementation. -->
<parameter name="urlStreamHandler" value="org.apache.nutch.protocol.smb.SmbHandler"/>
</implementation>

</extension>

</plugin>
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.smb;

import java.net.URL;
import java.net.URLConnection;
import java.net.URLStreamHandler;

/**
 * {@link URLStreamHandler} that answers every open request with a new
 * {@link SmbURLConnection} constructed from the requested URL.
 */
public class SmbHandler extends URLStreamHandler {

  /**
   * Creates the connection object for the given URL.
   *
   * @param url the URL a connection is requested for
   * @return a new {@link SmbURLConnection} constructed from {@code url}
   */
  @Override
  protected URLConnection openConnection(URL url) {
    final URLConnection connection = new SmbURLConnection(url);
    return connection;
  }
}
Loading