Skip to content

Method: withContentReader(Function)

1: /*
2: * *********************************************************************************************************************
3: *
4: * blueMarine II: Semantic Media Centre
5: * http://tidalwave.it/projects/bluemarine2
6: *
7: * Copyright (C) 2015 - 2021 by Tidalwave s.a.s. (http://tidalwave.it)
8: *
9: * *********************************************************************************************************************
10: *
11: * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
12: * the License. You may obtain a copy of the License at
13: *
14: * http://www.apache.org/licenses/LICENSE-2.0
15: *
16: * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
17: * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
18: * specific language governing permissions and limitations under the License.
19: *
20: * *********************************************************************************************************************
21: *
22: * git clone https://bitbucket.org/tidalwave/bluemarine2-src
23: * git clone https://github.com/tidalwave-it/bluemarine2-src
24: *
25: * *********************************************************************************************************************
26: */
27: package it.tidalwave.bluemarine2.mediascanner.impl.tika;
28:
import javax.annotation.Nonnull;
import javax.annotation.concurrent.Immutable;
import java.util.Map;
import java.util.function.Function;
import java.util.function.Supplier;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.image.ImageMetadataExtractor;
import org.apache.tika.parser.xmp.JempboxExtractor;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.ToString;
import lombok.With;
import lombok.extern.slf4j.Slf4j;
import static java.nio.charset.StandardCharsets.UTF_8;
import static it.tidalwave.util.FunctionalCheckedExceptionWrappers.*;
55:
56: /***********************************************************************************************************************
57: *
58: * @author Fabrizio Giudici
59: *
60: **********************************************************************************************************************/
61: @Slf4j
62: public class TikaMetadataLoader
63: {
64: /*******************************************************************************************************************
65: *
66: * A default content reader that just reads data in the simplest way.
67: *
68: ******************************************************************************************************************/
69: public static final Function<Path, byte[]> DEFAULT_CONTENT_READER = _f(Files::readAllBytes);
70:
71: /*******************************************************************************************************************
72: *
73: * An experimental reader that wraps contents in a xpacket section - it can be used to try parser that extract
74: * embedded XMP, such as Jempbox.
75: *
76: ******************************************************************************************************************/
77: public static final Function<Path, byte[]> EXP_XMP_PACKET_WRAPPER_CONTENT_READER = path ->
78: {
79: try
80: {
81: return ("<?xpacket begin=\"\ufeff\" id=\"W5M0MpCehiHzreSzNTczkc9d\"?>"
82: + Files.readString(path, UTF_8)
83: + "<?xpacket end=\"w\"?>").getBytes(UTF_8);
84: }
85: // FIXME: handle RuntimeException, IOException - make FunctionalCheckedExceptionWrappers's method public
86: catch (Exception e)
87: {
88: throw new RuntimeException(e);
89: }
90: };
91:
92: /*******************************************************************************************************************
93: *
94: * A strategy to extract metadata.
95: *
96: ******************************************************************************************************************/
97: public static interface MetadataExtractor
98: {
99: /***************************************************************************************************************
100: *
101: * Extracts metadata.
102: *
103: * @param path the path of the file to extract metadata from
104: * @param bytes the contents of the file
105: * @param config the configuration of the extraction
106: * @param metadata the object where to store metadata to
107: * @throws TikaException in case of error
108: * @throws IOException in case of error
109: * @throws SAXException in case of error
110: *
111: **************************************************************************************************************/
112: public void extractMetadata (@Nonnull final Path path,
113: @Nonnull final byte[] bytes,
114: @Nonnull final Config config,
115: @Nonnull final Metadata metadata)
116: throws TikaException, IOException, SAXException;
117: }
118:
119: /*******************************************************************************************************************
120: *
121: * This seems to be the standard way to use Tika with all defaults.
122: *
123: ******************************************************************************************************************/
124: public static final MetadataExtractor DEFAULT_METADATA_EXTRACTOR = (path, bytes, params, metadata) ->
125: {
126: metadata.set(Metadata.CONTENT_TYPE, Files.probeContentType(path));
127: final Parser parser = params.parserSupplier.get();
128: final DefaultHandler handler = params.handlerSupplier.get();
129: final ParseContext context = params.parseContextSupplier.get();
130: parser.parse(new ByteArrayInputStream(bytes), handler, metadata, context);
131: };
132:
133: /*******************************************************************************************************************
134: *
135: * An experimental alternate way to extract data from XMP sidecar files.
136: *
137: ******************************************************************************************************************/
138: public static final MetadataExtractor EXP_XMP_METADATA_EXTRACTOR = (path, bytes, params, metadata) ->
139: {
140: // metadata.set(Metadata.CONTENT_TYPE, "application/rdf+xml");
141: metadata.set(Metadata.CONTENT_TYPE, "application/xml");
142: final ImageMetadataExtractor ime = new ImageMetadataExtractor(metadata);
143: ime.parseRawXMP(bytes);
144: };
145:
146: /*******************************************************************************************************************
147: *
148: * An experimental alternate way to extract data from XMP sidecar files.
149: *
150: ******************************************************************************************************************/
151: public static final MetadataExtractor EXP_XMP_METADATA_EXTRACTOR_JEMPBOX = (path, bytes, params, metadata) ->
152: {
153: try (final InputStream is = new ByteArrayInputStream(bytes))
154: {
155: new JempboxExtractor(metadata).parse(is);
156: }
157: };
158:
159: /*******************************************************************************************************************
160: *
161: * The configuration for the Tika metadata loader.
162: *
163: ******************************************************************************************************************/
164: @RequiredArgsConstructor @Getter @ToString @Immutable
165: public static class Config
166: {
167: /** The default configuration. */
168: public static final Config DEFAULT = new Config(TikaMetadataLoader.DEFAULT_CONTENT_READER,
169: AutoDetectParser::new,
170: DefaultHandler::new,
171: ParseContext::new,
172: TikaMetadataLoader.DEFAULT_METADATA_EXTRACTOR);
173:
174: /** The configuration for XMP sidecar files. */
175: public static final Config XMP_SIDECAR = new Config(TikaMetadataLoader.DEFAULT_CONTENT_READER,
176: XmpParser::new,
177: DefaultHandler::new,
178: ParseContext::new,
179: TikaMetadataLoader.DEFAULT_METADATA_EXTRACTOR);
180:
181: /** A map of default config overrides. */
182: public static final Map<ExtensionAndMimeType, Config> DEFAULT_ASSOCIATIONS =
183: Map.of(ExtensionAndMimeType.ofExtension("xmp"), XMP_SIDECAR);
184:
185: /** The object that loads file contents. */
186:• @Nonnull @With
187: private final Function<Path, byte[]> contentReader;
188:
189: /** The supplier of a parser. */
190: @Nonnull @With
191: private final Supplier<Parser> parserSupplier;
192:
193: /** The SAX handler. */
194: @Nonnull @With
195: public Supplier<DefaultHandler> handlerSupplier;
196:
197: /** The parse context. */
198: @Nonnull @With
199: public Supplier<ParseContext> parseContextSupplier;
200:
201: /** The metadata extractor. */
202: @Nonnull @With
203: private final MetadataExtractor metadataExtractor;
204: }
205:
206: /*******************************************************************************************************************
207: *
208: * Loads metadata from a given file.
209: *
210: * @param path the path of the file to extract metadata from
211: * @param config the Tika configuration
212: * @return the metadata
213: * @throws TikaException in case of error
214: * @throws IOException in case of error
215: * @throws SAXException in case of error
216: *
217: ******************************************************************************************************************/
218: @Nonnull
219: public MetadataWithPath loadMetadata (@Nonnull final Path path, @Nonnull final Config config)
220: throws TikaException, IOException, SAXException
221: {
222: log.info("======== {}", path);
223: final byte[] bytes = config.contentReader.apply(path);
224: final Metadata metadata = new Metadata();
225: config.metadataExtractor.extractMetadata(path, bytes, config, metadata);
226:
227: return new MetadataWithPath(path, metadata);
228: }
229: }