diff --git a/README.md b/README.md index eb49c562f..ee4e7c37c 100644 --- a/README.md +++ b/README.md @@ -68,24 +68,25 @@ That's already it! If you run this code, it should print out something like this: ```console -Fundus-Article: +Fundus-Article including 1 image(s): - Title: "Feinstein's Return Not Enough for Confirmation of Controversial New [...]" -- Text: "Democrats jammed three of President Joe Biden's controversial court nominees - through committee votes on Thursday thanks to a last-minute [...]" +- Text: "89-year-old California senator arrived hour late to Judiciary Committee hearing + to advance President Biden's stalled nominations Democrats [...]" - URL: https://freebeacon.com/politics/feinsteins-return-not-enough-for-confirmation-of-controversial-new-hampshire-judicial-nominee/ -- From: FreeBeacon (2023-05-11 18:41) +- From: The Washington Free Beacon (2023-05-11 18:41) -Fundus-Article: +Fundus-Article including 3 image(s): - Title: "Northwestern student government freezes College Republicans funding over [...]" - Text: "Student government at Northwestern University in Illinois "indefinitely" froze the funds of the university's chapter of College Republicans [...]" - URL: https://www.foxnews.com/us/northwestern-student-government-freezes-college-republicans-funding-poster-critical-lgbtq-community -- From: FoxNews (2023-05-09 14:37) +- From: Fox News (2023-05-09 14:37) ``` This printout tells you that you successfully crawled two articles! For each article, the printout details: +- the number of images included in the article - the "Title" of the article, i.e. its headline - the "Text", i.e. the main article body text - the "URL" from which it was crawled @@ -146,6 +147,57 @@ for article in crawler.crawl(max_articles=1000000): ```` +## Example 4: Crawl some images + +By default, Fundus tries to parse the images included in every crawled article. +Let's crawl an article and print out the images for some more details. 
+ +```python +from fundus import PublisherCollection, Crawler + +# initialize the crawler for The LA Times +crawler = Crawler(PublisherCollection.us.LATimes) + +# crawl 1 article and print the images +for article in crawler.crawl(max_articles=1): + for image in article.images: + print(image) +``` + +For [this article](https://www.latimes.com/sports/lakers/story/2024-12-13/lakers-lebron-james-away-from-team-timberwolves) you will get the following output: + +```console +Fundus-Article Cover-Image: +-URL: 'https://ca-times.brightspotcdn.com/dims4/default/41c9bc4/2147483647/strip/true/crop/4598x3065+0+0/resize/1200x800!/format/webp/quality/75/?url=https%3A%2F%2Fcalifornia-times-brightspot.s3.amazonaws.com%2F77%2Feb%2F7fed2d3942fd97b0f7325e7060cf%2Flakers-timberwolves-basketball-33765.jpg' +-Description: 'Minnesota Timberwolves forward Julius Randle (30) works toward the basket.' +-Caption: 'Minnesota Timberwolves forward Julius Randle, left, controls the ball in front of Lakers forward Anthony Davis during the first half of the Lakers’ 97-87 loss Friday.' +-Authors: ['Abbie Parr / Associated Press'] +-Versions: [320x213, 568x379, 768x512, 1024x683, 1200x800] + +Fundus-Article Image: +-URL: 'https://ca-times.brightspotcdn.com/dims4/default/9a22715/2147483647/strip/true/crop/4706x3137+0+0/resize/1200x800!/format/webp/quality/75/?url=https%3A%2F%2Fcalifornia-times-brightspot.s3.amazonaws.com%2Ff7%2F52%2Fdcd6b263480ab579ac583a4fdbbf%2Flakers-timberwolves-basketball-48004.jpg' +-Description: 'Lakers coach JJ Redick talks with forward Anthony Davis during a loss to the Timberwolves.' +-Caption: 'Lakers coach JJ Redick, right, talks with forward Anthony Davis during the first half of a 97-87 loss to the Timberwolves on Friday night.' 
+-Authors: ['Abbie Parr / Associated Press'] +-Versions: [320x213, 568x379, 768x512, 1024x683, 1200x800] + +Fundus-Article Image: +-URL: 'https://ca-times.brightspotcdn.com/dims4/default/580bae4/2147483647/strip/true/crop/5093x3470+0+0/resize/1200x818!/format/webp/quality/75/?url=https%3A%2F%2Fcalifornia-times-brightspot.s3.amazonaws.com%2F3b%2Fdf%2F64c0198b4c2fb2b5824aaccb64b7%2F1486148-sp-nba-lakers-trailblazers-25-gmf.jpg' +-Description: 'Lakers star LeBron James sits in street clothes on the bench next to his son, Bronny James.' +-Caption: 'Lakers star LeBron James sits in street clothes on the bench next to his son, Bronny James, during a win over Portland at Crypto.com Arena on Dec. 8.' +-Authors: ['Gina Ferazzi / Los Angeles Times'] +-Versions: [320x218, 568x387, 768x524, 1024x698, 1200x818] +``` + +For each image, the printout details: +- The cover image designation (if applicable). +- The URL for the highest-resolution version of the image. +- A description of the image. +- The image's caption. +- The name of the copyright holder. +- A list of all available versions of the image. + + ## Tutorials We provide **quick tutorials** to get you started with the library: diff --git a/docs/3_the_article_class.md b/docs/3_the_article_class.md index 1ee03e9d7..dc5884e68 100644 --- a/docs/3_the_article_class.md +++ b/docs/3_the_article_class.md @@ -4,6 +4,7 @@ * [What is an `Article`](#what-is-an-article) * [The articles' body](#the-articles-body) * [HTML](#html) + * [Images](#images) * [Language detection](#language-detection) * [Saving an Article](#saving-an-article) @@ -117,6 +118,22 @@ Here you have access to the following information: 4. `crawl_date: datetime`: The exact timestamp the article was crawled. 5. `source_info: SourceInfo`: Some information about the HTML's origins, mostly for debugging purpose. +## Images + +Some publishers provide images with their articles. 
+To encompass all necessary information, the article's `images` attribute returns a list of custom `Image` objects. +Each `Image` object contains the following attributes: +- `url`: the URL of the image with the largest dimensions. +- `versions`: a list of custom `ImageVersion` objects, each containing the following attributes: + - `url`: the URL of the image with the specific dimensions. + - `size`: a `Dimension` object with attributes `width` and `height`. + - `type`: the MIME type of the image (e.g. `image/jpeg`, `image/png`). +- `is_cover`: a boolean indicating whether the image is the cover image of the article. +- `description`: a string describing the image (usually the alt-text). +- `caption`: the image caption as used in the article. +- `authors`: a list of strings representing the authors of the image. +- `position`: an integer describing the position of the image in the DOM-tree. + ## Language detection Sometimes publishers support articles in different languages. diff --git a/docs/attribute_guidelines.md b/docs/attribute_guidelines.md index 69000aad4..ded38695f 100644 --- a/docs/attribute_guidelines.md +++ b/docs/attribute_guidelines.md @@ -66,4 +66,11 @@ Those attributes will be validated with unit tests when used. bool + + images + A list of `Image` objects - Fundus' own data type for image representation - included within the article. + The `Image` objects include metadata like caption, authors, and position if available. 
+ List[Image] + image_extraction + diff --git a/docs/how_to_add_a_publisher.md b/docs/how_to_add_a_publisher.md index c011c36d8..f0205a5a3 100644 --- a/docs/how_to_add_a_publisher.md +++ b/docs/how_to_add_a_publisher.md @@ -17,7 +17,8 @@ * [Working with `lxml`](#working-with-lxml) * [CSS-Select](#css-select) * [XPath](#xpath) - * [Extract the ArticleBody](#extract-the-articlebody) + * [Extracting the ArticleBody](#extracting-the-articlebody) + * [Extracting the Images](#extracting-the-images) * [Checking the free_access attribute](#checking-the-free_access-attribute) * [Finishing the Parser](#finishing-the-parser) * [6. Generate unit tests and update tables](#6-generate-unit-tests-and-update-tables) @@ -533,7 +534,7 @@ Instead, we recommend referring to [this](https://devhints.io/xpath) documentati Make sure to examine other parsers and consult the [attribute guidelines](attribute_guidelines.md) for specifics on attribute implementation. We strongly encourage utilizing these utility functions, especially when parsing the `ArticleBody`. -### Extract the ArticleBody +### Extracting the ArticleBody In the context of Fundus, an article's body typically includes multiple paragraphs, and optionally, a summary and several subheadings. It's important to note that article layouts can vary significantly between publishers, with the most common layouts being: @@ -546,6 +547,39 @@ To accurately extract the body of an article, use the `extract_article_body_with This function accepts selectors for the different body parts as input and returns a parsed `ArticleBody`. For practical examples, refer to existing parser implementations to understand how everything integrates. +### Extracting the images + +Fundus offers a utility function `image_extraction` to extract images from the article. +This function only requires the `doc` element of the article and the `_paragraph_selector` of the parser with further optional attributes that can be used if necessary. 
+The skeleton of the function looks like this: + +```python +from fundus.parser.utility import image_extraction +from fundus.parser import Image + +@attribute +def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + ) +``` + +Once you have implemented this, you can try to extract your first images from the article body! +At this point, you may run into an `IndexError`. +This is caused by the `upper_boundary_selector` (by default `//main`) not selecting any element. +You have to adjust it to select an element above the cover image; all images that lie before this upper boundary are discarded. +Once you get your first images, you can further fine-tune your results: + +- `image_selector`: This selector is used to filter which image elements are selected. +- `lower_boundary_selector`: By default, all images after the last paragraph are discarded. With this selector, you can define your custom boundary. +- `caption_selector`: This selector is used to extract the caption of the image and should usually be of the form `XPath("./ancestor::...")`. +- `alt_selector`: This selector selects the alt text (description) of the image. +- `author_selector`: You have two options when selecting the author of the image: + - Preferably, the credits are within their own HTML element and can be directly addressed using an XPath selector. + - Alternatively, a `re.Pattern` object can be passed to select the authors from the caption. In this case, the group named `credits` is saved as the author, while the entire match is removed from the caption. +- `relative_urls`: If set, an attempt will be made to complete relative URLs. +- `size_pattern`: A `re.Pattern` object that can be used to extract the image sizes. 
### Checking the free_access attribute diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md index 19e85b65d..fb59fbc52 100644 --- a/docs/supported_publishers.md +++ b/docs/supported_publishers.md @@ -393,7 +393,9 @@ www.dw.com -   + + images +   @@ -1665,7 +1667,9 @@ www.cnbc.com -   + + images + key_points @@ -1716,7 +1720,9 @@ occupydemocrats.com -   + + images + description @@ -1735,7 +1741,9 @@ www.reuters.com -   + + images +   @@ -1865,6 +1873,7 @@ + images topics   @@ -1899,6 +1908,7 @@ + images topics   diff --git a/scripts/generate_parser_test_files.py b/scripts/generate_parser_test_files.py index 859f88552..daf49e187 100644 --- a/scripts/generate_parser_test_files.py +++ b/scripts/generate_parser_test_files.py @@ -143,7 +143,11 @@ def main() -> None: test_data[type(versioned_parser).__name__] = new else: entry.update(new) - test_data[type(versioned_parser).__name__] = dict(sorted(entry.items())) + + # sort entries + test_data[type(versioned_parser).__name__] = dict( + sorted(test_data[type(versioned_parser).__name__].items()) + ) test_data_file.write(test_data) bar.update() diff --git a/src/fundus/parser/__init__.py b/src/fundus/parser/__init__.py index 2e8622df8..7ea710953 100644 --- a/src/fundus/parser/__init__.py +++ b/src/fundus/parser/__init__.py @@ -1,4 +1,4 @@ from .base_parser import BaseParser, ParserProxy, attribute, function -from .data import ArticleBody +from .data import ArticleBody, Image -__all__ = ["ParserProxy", "BaseParser", "attribute", "function", "ArticleBody"] +__all__ = ["ParserProxy", "BaseParser", "attribute", "function", "ArticleBody", "Image"] diff --git a/src/fundus/parser/data.py b/src/fundus/parser/data.py index fd8ea688f..96a44f898 100644 --- a/src/fundus/parser/data.py +++ b/src/fundus/parser/data.py @@ -1,9 +1,13 @@ +from __future__ import annotations + import re from abc import ABC, abstractmethod from dataclasses import dataclass, fields +from functools import total_ordering from itertools import 
chain from typing import ( Any, + ClassVar, Collection, Dict, Iterable, @@ -17,15 +21,22 @@ Union, overload, ) +from urllib.parse import urljoin, urlparse import lxml.etree +import lxml.html import more_itertools +import validators import xmltodict from dict2xml import dict2xml from lxml.etree import XPath, fromstring, tostring from typing_extensions import Self, TypeAlias, deprecated -from fundus.utils.serialization import JSONVal, replace_keys_in_nested_dict +from fundus.utils.serialization import ( + DataclassSerializationMixin, + JSONVal, + replace_keys_in_nested_dict, +) LDMappingValue: TypeAlias = Union[List[Dict[str, Any]], Dict[str, Any]] @@ -400,3 +411,165 @@ def deserialize(cls, serialized: Dict[str, Any]) -> Self: def __bool__(self): return any(bool(section) for section in self.sections) + + +@total_ordering +@dataclass +class Dimension(DataclassSerializationMixin): + width: int + height: int + + def __mul__(self, other: Union[float, int]) -> "Dimension": + if isinstance(other, int): + return Dimension(self.width * other, self.height * other) + elif isinstance(other, float): + return Dimension(round(self.width * other), round(self.height * other)) + else: + raise NotImplementedError( + f"'*' is not defined between {type(self).__name__!r} and {type(other).__name__!r}" + ) + + def __rmul__(self, other: Union[float, int]) -> "Dimension": + return self.__mul__(other) + + def __repr__(self) -> str: + return f"{self.width}x{self.height or '...'}" + + def __lt__(self, other: "Dimension") -> bool: + if isinstance(other, Dimension): + if self.width != other.width: + return self.width < other.width + else: + return self.height < other.height + raise NotImplementedError(f"'<' is not defined between {type(self).__name__!r} and {type(other).__name__!r}") + + def __hash__(self) -> int: + return hash((self.width, self.height)) + + @classmethod + def from_ratio( + cls, + width: Optional[float] = None, + height: Optional[float] = None, + ratio: Optional[float] = None, + 
) -> Optional["Dimension"]: + if width and height: + return cls(round(width), round(height)) + elif width is not None: + return cls(round(width), round((width / ratio) if ratio else 0)) + elif height is not None: + return cls(round((height * ratio) if ratio else 0), round(height)) + else: + return None + + +def remove_query_parameters_from_url(url: str) -> str: + if any(parameter_indicator in url for parameter_indicator in ("?", "#")): + return urljoin(url, urlparse(url).path) + return url + + +@total_ordering +@dataclass +class ImageVersion(DataclassSerializationMixin): + __FILE_FORMATS__: ClassVar[List[str]] = ["png", "jpg", "jpeg", "webp"] + + url: str + query_width: Optional[str] = None + size: Optional[Dimension] = None + type: Optional[str] = None + + def __post_init__(self): + if not self.type: + url_without_query = remove_query_parameters_from_url(self.url) + self.type = self._parse_type(url_without_query) + + def _parse_type(self, url: str) -> Optional[str]: + if (file_format := url.split(".")[-1]) in self.__FILE_FORMATS__: + if file_format == "jpg": + file_format = "jpeg" + return "image/" + file_format + return None + + def __repr__(self) -> str: + if self.size is not None: + meta = f"{self.size!r}" + elif self.query_width is not None: + meta = f"min-width: {self.query_width}px" + else: + meta = f"{type(self).__name__}" + + return f"{meta}; {self.type}" + + def __hash__(self) -> int: + return hash(self.url) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, ImageVersion): + return NotImplemented + return self.url == other.url + + def __lt__(self, other: "ImageVersion") -> bool: + if isinstance(other, ImageVersion): + if self.size and other.size and self.size != other.size: + return self.size < other.size + if (self.size is None) != (other.size is None): + return self.size is None + + if self.query_width and other.query_width and self.query_width != other.query_width: + return self.query_width < other.query_width + if 
(self.query_width is None) != (other.query_width is None): + return self.query_width is None + + if self.type and other.type and self.type != other.type: + return self.type < other.type + if (self.type is None) != (other.type is None): + return self.type is None + + return self.url < other.url + raise NotImplementedError(f"'<' is not defined between {type(self).__name__!r} and {type(other).__name__!r}") + + +@dataclass(frozen=False) +class Image(DataclassSerializationMixin): + versions: List[ImageVersion] + is_cover: bool + description: Optional[str] + caption: Optional[str] + authors: List[str] + position: int + + def __post_init__(self): + for url in [version.url for version in self.versions]: + if not validators.url(url, strict_query=False): + raise ValueError(f"url {url} is not a valid URL") + + @property + def url(self) -> str: + return self.versions[-1].url + + def __str__(self) -> str: + if self.is_cover: + representation = "Fundus-Article Cover-Image:\n" + else: + representation = "Fundus-Article Image:\n" + representation += ( + f"-URL:\t\t\t {self.url!r}\n" + f"-Description:\t {self.description!r}\n" + f"-Caption:\t\t {self.caption!r}\n" + f"-Authors:\t\t {self.authors}\n" + f"-Versions:\t\t {sorted(set(v.size for v in self.versions if v.size is not None))}\n" + ) + return representation + + def __repr__(self) -> str: + return self.url + + +class DOM: + def __init__(self, root: lxml.html.HtmlElement): + self.root = root + self._depth_first_index = {element: i for i, element in enumerate(root.iter())} + + def get_index(self, node: lxml.html.HtmlElement) -> int: + return self._depth_first_index[node] diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py index 321252c1a..3f77eabf0 100644 --- a/src/fundus/parser/utility.py +++ b/src/fundus/parser/utility.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import itertools import json import re @@ -11,28 +13,39 @@ ClassVar, Dict, Iterable, + Iterator, List, Match, + NamedTuple, 
Optional, Pattern, + Sequence, + Set, Type, Union, cast, ) +from urllib.parse import urljoin import lxml.html import more_itertools +import validators from dateutil import parser from lxml.cssselect import CSSSelector from lxml.etree import XPath from fundus.logging import create_logger from fundus.parser.data import ( + DOM, ArticleBody, ArticleSection, + Dimension, + Image, + ImageVersion, LinkedDataMapping, TextSequence, ) +from fundus.utils.regex import _get_match_dict from fundus.utils.serialization import JSONVal logger = create_logger(__name__) @@ -264,13 +277,26 @@ def strip_nodes_to_text(text_nodes: List[lxml.html.HtmlElement], join_on: str = return join_on.join(([re.sub(r"\n+", " ", node.text_content()) for node in text_nodes])).strip() -def generic_nodes_to_text(nodes: List[lxml.html.HtmlElement], normalize: bool = False) -> List[str]: +def generic_nodes_to_text(nodes: Sequence[Union[lxml.html.HtmlElement, str]], normalize: bool = False) -> List[str]: if not nodes: return [] - texts = [ - normalize_whitespace(str(node.text_content())) if normalize else str(node.text_content()) for node in nodes - ] - return [text for text in texts if text] + + texts = [] + for node in nodes: + if isinstance(node, lxml.html.HtmlElement): + text = str(node.text_content()) + elif isinstance(node, str): + text = node + else: + raise TypeError(f"Unexpected type {type(node)}") + + if normalize: + text = normalize_whitespace(text) + + if text: + texts.append(text) + + return texts def apply_substitution_pattern_over_list( @@ -298,20 +324,21 @@ def generic_author_parsing( value (list[str]): value\n value (list[dict]): [dict["name"] for dict in list if dict["name"]] \n - with common delimiters := [",", ";", " und ", " and ", " & "] + with common delimiters := [",", ";", " und ", " and ", " & ", " | "] All values are stripped with default strip() method before returned. 
Args: - value: An input value representing author(s) which get parsed based on type - split_on: Only relevant for type() = str. If set, split on , - else (default) split on common delimiters + value: An input value representing author(s) which get parsed based on type. + split_on: Only relevant for type() = str. If set, split on , + else (default) split on common delimiters. + normalize: If True, normalize every author with normalize_whitespace(). Defaults to True Returns: A parsed and striped list of authors """ - common_delimiters = [",", ";", " und ", " and ", " & "] + common_delimiters = [",", ";", " und ", " and ", " & ", r" \| "] parameter_type_error: TypeError = TypeError( f" '{value}' has an unsupported type {type(value)}. " @@ -394,3 +421,366 @@ def parse_title_from_root(root: lxml.html.HtmlElement) -> Optional[str]: return None return strip_nodes_to_text(title_node) + + +def preprocess_url(url: str, domain: str) -> str: + url = re.sub(r"\\/", "/", url) + # Some publishers use relative URLs + if not validators.url(url): + publisher_domain = "https://" + domain + url = urljoin(publisher_domain, url) + return url + + +def image_author_parsing(authors: Union[str, List[str]]) -> List[str]: + credit_keywords = [ + "credits?", + "quellen?", + "bild(rechte)?", + "sources?", + r"(((f|ph)oto(graph)?s?|image|illustrations?|cartoons?|pictures?)\s*)+(by|:|courtesy)", + "©", + "– alle rechte vorbehalten", + "copyright", + "all rights reserved", + "courtesy of", + "=", + ] + author_filter = re.compile(r"(?is)^(" + r"|".join(credit_keywords) + r"):?\s*") + + def clean(author: str): + author = re.sub(r"^\((.*)\)$", r"\1", author).strip() + # filtering credit keywords + author = re.sub(author_filter, "", author, count=1) + # filtering bloat follwing the author + author = re.sub(r"(?i)/?copyright.*", "", author) + return author.strip() + + if isinstance(authors, list): + authors = [clean(author) for author in authors] + else: + authors = clean(authors) + return 
generic_author_parsing(authors) + + +# https://regex101.com/r/MplUXL/2 +_srcset_pattern = re.compile(r"(?P<url>\S+)\s*(?P<descriptor>[0-9.]+[wx])?(,?\s*)") + + +def parse_srcset(srcset: str) -> Dict[str, str]: + # Updated regular expression to account for query parameters in URLs + urls = {} + for match in _srcset_pattern.finditer(srcset.strip()): + url = match.group("url") + descriptor = match.group("descriptor") # Width (w) or pixel density (x) + urls[descriptor or "1x"] = url + # return sorted dict based on int value of descriptor + return dict(sorted(urls.items(), key=lambda item: float(item[0][:-1]))) + + +# that's the same as string(./attribute::*[ends-with(name(), '*')]) but LXML doesn't support the ends-with function +# these two selectors select the value of the first attribute found ending with src/srcset relative to the node +# as truing value +_srcset_selector = XPath( + "./@*[substring(name(), string-length(name()) - string-length('srcset') + 1) = 'srcset'][starts-with(., 'http') or starts-with(., '/')]" +) +_src_selector = XPath( + "./@*[substring(name(), string-length(name()) - string-length('src') + 1) = 'src'][starts-with(., 'http') or starts-with(., '/')]" +) + + +def parse_urls(node: lxml.html.HtmlElement) -> Optional[Dict[str, str]]: + def get_longest_string(strings: List[str]) -> str: + return sorted(strings, key=len)[-1] + + if srcset := cast(List[str], _srcset_selector(node)): + return parse_srcset(normalize_whitespace(get_longest_string(srcset))) + elif src := cast(List[str], _src_selector(node)): + return {"1x": normalize_whitespace(get_longest_string(src))} + else: + return None + + +class _DimensionCalculator: + def __init__( + self, width: Optional[float] = None, height: Optional[float] = None, ratio: Optional[float] = None + ) -> None: + self.width = width + self.height = height + self.ratio = ratio + + def calculate( + self, width: Optional[float] = None, height: Optional[float] = None, dpr: Optional[float] = None + ) -> Optional[Dimension]: + if not 
(width or height): + width = self.width + height = self.height + if dimension := Dimension.from_ratio(width, height, self.ratio): + return dimension * (dpr or 1) + return None + + +_media_param_pattern = re.compile(r"\(\s*(?P<param>[\w-]+)\s*:\s*(?P<value>[\d./]+)(?P<descriptor>[a-z]*)\)") +_width_x_height_pattern = re.compile(r"(?P<width>[0-9]+)x(?P<height>[0-9]+)") + + +def get_versions_from_node( + source: lxml.html.HtmlElement, ratio: Optional[float], size_pattern: Optional[Pattern[str]] +) -> Set[ImageVersion]: + if not (urls := parse_urls(source)): + return set() + + # get min/max width + query_width = None + for param, value, descriptor in re.findall(_media_param_pattern, source.get("media", "").split(",")[0]): + if param in ["min-width", "max-width"]: + if descriptor != "px": + logger.debug(f"Pixel calculation not implemented for {descriptor}") + else: + # with the assumption that there is only one max/min width per ',' separated query and only + # either min- or max-width + query_width = f"{param}:{value}" + + # get width, height and init calculator + if (src_width := source.get("width")) and src_width.replace(".", "", 1).isdigit(): + width = float(src_width or 0) or None + else: + width = None + if (src_height := source.get("height")) and src_height.replace(".", "", 1).isdigit(): + height = float(src_height or 0) or None + else: + height = None + if width and height: + ratio = width / height + calculator = _DimensionCalculator(width, height, ratio) + + versions = set() + for descriptor, url in urls.items(): + kwargs: Dict[str, float] = {} + if descriptor is not None: + if match := re.search(r"(?P<multiplier>[0-9.]+)x", descriptor): + kwargs["dpr"] = float(match.group("multiplier")) + elif match := re.search(r"(?P<width>[0-9]+)(px|w)", descriptor): + kwargs["width"] = float(match.group("width")) + + if size_pattern is not None and ( + match_dict := _get_match_dict(size_pattern, url, conversion=lambda x: float(x)) + ): + kwargs.update(match_dict) + elif not (calculator.width or kwargs.get("width")) and (match := 
re.search(_width_x_height_pattern, url)): + kwargs.update({k: float(v) for k, v in match.groupdict().items() if v is not None}) + + version = ImageVersion( + url=url, query_width=query_width, size=calculator.calculate(**kwargs), type=source.get("type") + ) + versions.add(version) + + return versions + + +_relative_source_selector = XPath("./ancestor::picture//source") + + +def parse_versions(img_node: lxml.html.HtmlElement, size_pattern: Optional[Pattern[str]] = None) -> List[ImageVersion]: + # parse img + if (default_width := img_node.get("width")) and (default_height := img_node.get("height")): + ratio = float(default_width) / float(default_height) + else: + ratio = None + + versions = set() + for source in _relative_source_selector(img_node) + [img_node]: + for version in get_versions_from_node(source, ratio, size_pattern): + versions.add(version) + + return sorted(versions) + + +class IndexedImageNode(NamedTuple): + position: int + content: lxml.html.HtmlElement + is_cover: bool + + +def parse_image_nodes( + image_nodes: List[IndexedImageNode], + caption_selector: XPath, + alt_selector: XPath, + author_selector: Union[XPath, Pattern[str]], + domain: Optional[str] = None, + size_pattern: Optional[Pattern[str]] = None, +) -> Iterator[Image]: + """Extract urls, caption, description and authors from a list of nodes + + Args: + image_nodes: Indexed nodes to parse. + caption_selector: Selector selecting the caption of an image. Defaults to selecting the figcaption element. + alt_selector: Selector selecting the descriptive text of an image. Defaults to selecting alt value. + author_selector: Selector selecting the credits for an image. Defaults to selecting an arbitrary child of + figure with copyright or credit in its class attribute. + domain: If set, the domain will be prepended to URLs in case they are relative + size_pattern: Regular expression to select <width>, <height> and <dpr> from the image URL. The given regExp + will be matched with re.findall and overwrites existing values. 
Defaults to None. + + Returns: + List of Images + """ + + def nodes_to_text(nodes: List[Union[lxml.html.HtmlElement, str]]) -> Optional[str]: + return " ".join(generic_nodes_to_text(nodes, normalize=True)) or None + + for position, node, is_cover in image_nodes: + # parse URLs + if not (versions := parse_versions(node, size_pattern)): + continue + + # resolve relative URLs if domain is given + if domain is not None: + for version in versions: + version.url = urljoin(domain, version.url) + + # parse caption + caption = nodes_to_text(caption_selector(node)) + + # parse description + description = nodes_to_text(alt_selector(node)) + + # parse authors + authors = [] + if isinstance(author_selector, Pattern): + # author is part of the caption + if caption and (match := re.search(author_selector, caption)): + authors = [match.group("credits")] + caption = re.sub(author_selector, "", caption).strip() or None + elif description and (match := re.search(author_selector, description)): + authors = [match.group("credits")] + else: + # author is selectable as node + if author_nodes := author_selector(node): + authors = generic_nodes_to_text(author_nodes, normalize=True) + authors = image_author_parsing(authors) + + yield Image( + versions=versions, + caption=caption, + authors=authors, + description=description, + is_cover=is_cover, + position=position, + ) + + +class Bounds(NamedTuple): + upper: int + first_paragraph: Optional[int] + lower: int + + +def determine_bounds( + dom: DOM, paragraph_selector: XPath, upper_boundary_selector: XPath, lower_boundary_selector: Optional[XPath] +) -> Optional[Bounds]: + def get_sorted_indices(nodes: List[lxml.html.HtmlElement]) -> List[int]: + return sorted([dom.get_index(node) for node in nodes]) + + # the getitem on upper_boundary_selector ensures that this throws an exception, if there are no + # upper_boundary_node present, as well as removing excess ones. 
+ upper_boundary_nodes = [upper_boundary_selector(dom.root)[0]] + paragraph_nodes = paragraph_selector(dom.root) + lower_boundary_nodes = lower_boundary_selector(dom.root) if lower_boundary_selector else [] + + sorted_indices = get_sorted_indices(upper_boundary_nodes + paragraph_nodes + lower_boundary_nodes) + + if len(sorted_indices) < 2: + return None + + return Bounds( + upper=sorted_indices[0], + first_paragraph=paragraph_indices[0] if (paragraph_indices := get_sorted_indices(paragraph_nodes)) else None, + lower=sorted_indices[-1], + ) + + +_og_url_selector = XPath("string(//meta[@property='og:url']/@content)") + + +def image_extraction( + doc: lxml.html.HtmlElement, + paragraph_selector: XPath, + image_selector: XPath = XPath("//figure//img"), + upper_boundary_selector: XPath = XPath("//main"), + lower_boundary_selector: Optional[XPath] = None, + caption_selector: XPath = XPath("./ancestor::figure//figcaption"), + alt_selector: XPath = XPath("./@alt"), + author_selector: Union[XPath, Pattern[str]] = XPath( + "(./ancestor::figure//*[(contains(@class, 'copyright') or contains(@class, 'credit')) and text()])[1]" + ), + relative_urls: Union[bool, XPath] = False, + size_pattern: Pattern[str] = re.compile( + r"width([=-])(?P<width>[0-9.]+)|height([=-])(?P<height>[0-9.]+)|dpr=(?P<dpr>[0-9.]+|)" + ), +) -> List[Image]: + """Extracts images enriched with metadata from <doc> based on given selectors. + + The core idea behind this function is to select all images matching <image_selector> that lie between + the first element selected by <upper_boundary_selector> and the last element of <paragraph_selector> + or <lower_boundary_selector>. The hierarchy is determined by indexing all nodes of <doc> depth first. + To enrich the selected images with metadata like 'caption', 'alt-description' or `authors`, one should make + use of the corresponding selectors. + + Args: + doc: The html document of the article. + paragraph_selector: Selector used to select the paragraphs of the article. + image_selector: Selector selecting all relevant img elements. Defaults to '//figure//img'. 
+ upper_boundary_selector: A selector referencing an element to be considered as the upper boundary. All img + elements before this element will be ignored. + lower_boundary_selector: A selector referencing an element to be considered as the lower boundary. All img + elements after this element will be ignored. Defaults to the last paragraph of an article. + caption_selector: Selector selecting the caption of an image. Defaults to selecting the figcaption element. + alt_selector: Selector selecting the descriptive text of an image. Defaults to selecting the alt value. + author_selector: Selector selecting the credits for an image. Defaults to selecting an arbitrary child of + figure with copyright or credit in its class attribute. + relative_urls: If True, the extractor assumes that image src URLs are relative and prepends the publisher + domain. + size_pattern: Regular expression to select <width>, <height> and <dpr> from the image URL. The given regExp + will be matched with re.findall and overwrites existing values. Defaults to matching width, height and dpr.
+ + Returns: + A list of Images contained within the article + + """ + + # index nodes df + dom = DOM(doc) + + # determine bounds based on df index + if not (bounds := determine_bounds(dom, paragraph_selector, upper_boundary_selector, lower_boundary_selector)): + raise ValueError("Bounds could not be determined") + + if relative_urls: + if isinstance(relative_urls, bool): + selector = _og_url_selector + else: + selector = relative_urls + if not (domain := selector(dom.root)): + raise ValueError("Could not determine domain") + else: + domain = None + + image_nodes = [ + IndexedImageNode(position=position, content=node, is_cover=position < (bounds.first_paragraph or 0)) + for node in image_selector(doc) + if bounds.upper < (position := dom.get_index(node)) < bounds.lower + ] + + images = list( + parse_image_nodes( + image_nodes=image_nodes, + caption_selector=caption_selector, + alt_selector=alt_selector, + author_selector=author_selector, + domain=domain, + size_pattern=size_pattern, + ) + ) + + return images diff --git a/src/fundus/publishers/at/derstandard.py b/src/fundus/publishers/at/derstandard.py index 251a9395b..c00f48ba9 100644 --- a/src/fundus/publishers/at/derstandard.py +++ b/src/fundus/publishers/at/derstandard.py @@ -1,13 +1,17 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -37,3 +41,12 @@ def authors(self) -> List[str]: @attribute def publishing_date(self) -> Optional[datetime.datetime]: return generic_date_parsing(self.precomputed.ld.bf_search("datePublished")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + 
author_selector=XPath("./ancestor::figure//footer"), + size_pattern=re.compile(r"/rs:fill:(?P<width>[0-9]+):"), + ) diff --git a/src/fundus/publishers/at/orf.py b/src/fundus/publishers/at/orf.py index 7122fdccc..fcf34a56c 100644 --- a/src/fundus/publishers/at/orf.py +++ b/src/fundus/publishers/at/orf.py @@ -3,11 +3,12 @@ from lxml.cssselect import CSSSelector -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -37,3 +38,10 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def title(self) -> Optional[str]: return self.precomputed.meta.get("og:title") + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + ) diff --git a/src/fundus/publishers/au/nine_news.py b/src/fundus/publishers/au/nine_news.py index 0ba36d45f..23ea67409 100644 --- a/src/fundus/publishers/au/nine_news.py +++ b/src/fundus/publishers/au/nine_news.py @@ -3,11 +3,12 @@ from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -43,3 +44,12 @@ def title(self) -> Optional[str]: @attribute def authors(self) -> List[str]: return generic_author_parsing(self.precomputed.ld.bf_search("author")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + caption_selector=XPath("./ancestor::figure//figcaption/text()[1]"), + author_selector=XPath("./ancestor::figure//figcaption/text()[last()]"), + )
diff --git a/src/fundus/publishers/au/west_australian.py b/src/fundus/publishers/au/west_australian.py index 634103f1f..8baefe8db 100644 --- a/src/fundus/publishers/au/west_australian.py +++ b/src/fundus/publishers/au/west_australian.py @@ -2,14 +2,16 @@ import re from typing import List, Optional +from lxml.cssselect import CSSSelector from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute, function -from fundus.parser.data import ArticleSection, TextSequence +from fundus.parser.data import ArticleSection, Image, TextSequence from fundus.parser.utility import ( generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, parse_json, ) @@ -52,3 +54,14 @@ def authors(self) -> List[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=CSSSelector("div#ArticleContent > p"), + upper_boundary_selector=CSSSelector("article"), + lower_boundary_selector=CSSSelector("div#footer"), + caption_selector=XPath("./ancestor::figure //span[contains(@class, 'CaptionText')] /span[1]"), + author_selector=XPath("./ancestor::figure //span[contains(@class, 'CaptionText')] /span[last()]"), + ) diff --git a/src/fundus/publishers/ca/cbc_news.py b/src/fundus/publishers/ca/cbc_news.py index c615469d7..9fa84de94 100644 --- a/src/fundus/publishers/ca/cbc_news.py +++ b/src/fundus/publishers/ca/cbc_news.py @@ -7,11 +7,13 @@ from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute from fundus.parser.base_parser import function +from fundus.parser.data import Image from fundus.parser.utility import ( extract_article_body_with_selector, extract_json_from_dom, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -64,3 +66,15 @@ def topics(self) -> List[str]: topic_list.append(re.sub(r".*/", "", path)) 
return topic_list + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//div[@data-cy='storyWrapper']"), + caption_selector=XPath( + "./ancestor::figure//figcaption | ./ancestor::span[contains(@class,'mediaEmbed')]/span" + ), + author_selector=re.compile(r"\((?P<credits>.*?)\)$"), + ) diff --git a/src/fundus/publishers/ca/globe_and_mail.py b/src/fundus/publishers/ca/globe_and_mail.py index a5e8318c7..c4a69db9a 100644 --- a/src/fundus/publishers/ca/globe_and_mail.py +++ b/src/fundus/publishers/ca/globe_and_mail.py @@ -2,13 +2,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -47,3 +49,12 @@ def topics(self) -> List[str]: for duplicate in topic_duplicates: topic_list.remove(duplicate) return [topic.title() for topic in topic_list if "news" not in topic] + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + caption_selector=XPath("./ancestor::figure//figcaption//p[@class='figcap-text']/span[1]"), + author_selector=XPath("./ancestor::figure//figcaption//p[@class='figcap-text']/span[last()]"), + ) diff --git a/src/fundus/publishers/ca/national_post.py b/src/fundus/publishers/ca/national_post.py index 3d9ed7d94..8de192eb4 100644 --- a/src/fundus/publishers/ca/national_post.py +++ b/src/fundus/publishers/ca/national_post.py @@ -5,11 +5,13 @@ from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import
Image from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) from fundus.scraping.filter import regex_filter @@ -57,3 +59,12 @@ def topics(self) -> List[str]: topic for topic in preliminary_topics if not topic_filter(topic) and topic not in filter_list ] return generic_topic_parsing(filtered_topics) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//div[@class='article-header__detail']/figure"), + lower_boundary_selector=CSSSelector("section.article-delimiter"), + ) diff --git a/src/fundus/publishers/ch/nzz.py b/src/fundus/publishers/ch/nzz.py index 2e28eb2f9..9755e60e1 100644 --- a/src/fundus/publishers/ch/nzz.py +++ b/src/fundus/publishers/ch/nzz.py @@ -3,13 +3,16 @@ from typing import List, Optional, Pattern from lxml.cssselect import CSSSelector +from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( apply_substitution_pattern_over_list, extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -48,3 +51,14 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def title(self) -> Optional[str]: return self.precomputed.meta.get("title") + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + caption_selector=XPath("./ancestor::figure//h2"), + author_selector=XPath("./ancestor::figure//div[@class='image-description__author']"), + upper_boundary_selector=CSSSelector("div#page"), + lower_boundary_selector=XPath("//div[@class='sharebox']"), + ) diff --git a/src/fundus/publishers/ch/srf.py b/src/fundus/publishers/ch/srf.py index 
4900644d4..6c4dfea1b 100644 --- a/src/fundus/publishers/ch/srf.py +++ b/src/fundus/publishers/ch/srf.py @@ -5,10 +5,12 @@ from lxml.html import HtmlElement from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -66,3 +68,14 @@ def title(self) -> Optional[str]: else: node: HtmlElement = title_node[0] return node.text_content() + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + caption_selector=XPath("./ancestor::figure//span[@class='media-caption__description']"), + author_selector=XPath("./ancestor::figure//span[@class='media-caption__source']"), + image_selector=XPath("//picture[@class='image ']//img"), + lower_boundary_selector=XPath("(//div[@class='sharing-bar__container'])[2]"), + ) diff --git a/src/fundus/publishers/ch/ta.py b/src/fundus/publishers/ch/ta.py index d02c9bb13..39855ac03 100644 --- a/src/fundus/publishers/ch/ta.py +++ b/src/fundus/publishers/ch/ta.py @@ -2,12 +2,14 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -42,3 +44,13 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def title(self) -> Optional[str]: return self.precomputed.meta.get("og:title") + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + caption_selector=XPath("./ancestor::figure//figcaption/div[1]"), + 
author_selector=XPath("./ancestor::figure//figcaption/div[2]"), + lower_boundary_selector=CSSSelector("div.EndOfArticleSectionList_root__LJO7G"), + ) diff --git a/src/fundus/publishers/cn/people.py b/src/fundus/publishers/cn/people.py index a5fa0f26a..aa487157d 100644 --- a/src/fundus/publishers/cn/people.py +++ b/src/fundus/publishers/cn/people.py @@ -3,13 +3,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, parse_title_from_root, ) @@ -43,3 +45,13 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords"), delimiter=" ") + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//img"), + upper_boundary_selector=XPath("//div[@class='layout route cf']"), + relative_urls=XPath("string((//head//link[@rel='stylesheet'])[1]/@href)"), + ) diff --git a/src/fundus/publishers/de/__init__.py b/src/fundus/publishers/de/__init__.py index c603bfaaa..e70afb545 100644 --- a/src/fundus/publishers/de/__init__.py +++ b/src/fundus/publishers/de/__init__.py @@ -113,7 +113,7 @@ class DE(metaclass=PublisherGroup): Sitemap("https://www.welt.de/sitemaps/sitemap/sitemap.xml"), NewsMap("https://www.welt.de/sitemaps/newssitemap/newssitemap.xml"), ], - url_filter=regex_filter("/Anlegertipps-|/videos[0-9]{2}"), + url_filter=regex_filter("/Anlegertipps-|/videos?[0-9]{2}|/mediathek/"), ) MDR = Publisher( @@ -389,6 +389,7 @@ class DE(metaclass=PublisherGroup): 
Sitemap("https://de.euronews.com/sitemaps/de/articles.xml"), NewsMap("https://de.euronews.com/sitemaps/de/latest-news.xml"), ], + url_filter=regex_filter("/video/"), ) Hessenschau = Publisher( @@ -407,6 +408,7 @@ class DE(metaclass=PublisherGroup): domain="https://www1.wdr.de/", parser=WDRParser, sources=[RSSFeed("https://www1.wdr.de/uebersicht-100.feed")], + url_filter=inverse(regex_filter("wdr.de/(?!mediathek/)")), ) BR = Publisher( @@ -448,6 +450,7 @@ class DE(metaclass=PublisherGroup): RSSFeed("https://follow.it/der-postillon-abo"), Sitemap("https://www.der-postillon.com/sitemap.xml"), ], + url_filter=regex_filter("https://follow.it/"), ) Kicker = Publisher( @@ -461,7 +464,7 @@ class DE(metaclass=PublisherGroup): ), NewsMap("https://newsfeed.kicker.de/googlesitemapnews.xml"), ], - url_filter=regex_filter("/slideshow|/video"), + url_filter=regex_filter("/slideshow|/video|heute-live|live-konferenz|/bilder|/ticker"), ) Krautreporter = Publisher( diff --git a/src/fundus/publishers/de/berliner_zeitung.py b/src/fundus/publishers/de/berliner_zeitung.py index 0bf924041..4d4b806c1 100644 --- a/src/fundus/publishers/de/berliner_zeitung.py +++ b/src/fundus/publishers/de/berliner_zeitung.py @@ -2,13 +2,16 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -42,3 +45,19 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//div[not(contains(@class, 
'author') or contains(@class, 'preview'))]/img"), + caption_selector=XPath( + "./ancestor::div[@class='article_image-container__Yo6Cx']" + "//span[@class='article_image-container-caption__lZ5kc']" + ), + author_selector=XPath( + "./ancestor::div[@class='article_image-container__Yo6Cx']" + "//span[@class='article_image-container-source__rbsO4']" + ), + ) diff --git a/src/fundus/publishers/de/bild.py b/src/fundus/publishers/de/bild.py index 168ccc57c..378754ed7 100644 --- a/src/fundus/publishers/de/bild.py +++ b/src/fundus/publishers/de/bild.py @@ -5,11 +5,13 @@ from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -50,3 +52,13 @@ def free_access(self) -> bool: return re.search(r"/bild-plus/", url) is None else: return True + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//figure//img[not(contains(@class, 'teaser') or contains(@class, 'author'))]"), + caption_selector=XPath("./ancestor::figure//p[@class='fig__caption__text']"), + author_selector=XPath("./ancestor::figure//div[@class='fig__caption__meta']"), + ) diff --git a/src/fundus/publishers/de/boersenzeitung.py b/src/fundus/publishers/de/boersenzeitung.py index e8e0ed84c..3dbf67272 100644 --- a/src/fundus/publishers/de/boersenzeitung.py +++ b/src/fundus/publishers/de/boersenzeitung.py @@ -5,11 +5,12 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, 
+ image_extraction, ) @@ -58,6 +59,16 @@ def free_access(self) -> bool: # print(self._paywall_selector(self.precomputed.doc).text_content().strip()) return not [node.text_content().strip() for node in self._paywall_selector(self.precomputed.doc)] + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//h1|//script"), + image_selector=XPath("//storefront-image|//figure//img"), + author_selector=XPath("./ancestor::storefront-section//storefront-html[@class='image-copyright']"), + ) + class V1_1(V1): VALID_UNTIL = datetime.date.today() diff --git a/src/fundus/publishers/de/br.py b/src/fundus/publishers/de/br.py index 266fe81b3..8f443ae81 100644 --- a/src/fundus/publishers/de/br.py +++ b/src/fundus/publishers/de/br.py @@ -1,15 +1,18 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -52,6 +55,19 @@ def body(self) -> Optional[ArticleBody]: def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) + @attribute + def images(self) -> List[Image]: + author_pattern: str = r"(?<=\|\sBild:\s).*$" + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//figure[not(parent::aside)]//img"), + author_selector=XPath( + f"re:match(./@title, '{author_pattern}')", + namespaces={"re": "http://exslt.org/regular-expressions"}, + ), + ) + class V1_1(V1): VALID_UNTIL = datetime.date.today() diff --git a/src/fundus/publishers/de/braunschweiger_zeitung.py 
b/src/fundus/publishers/de/braunschweiger_zeitung.py index 7d148b8f3..f3854f8c1 100644 --- a/src/fundus/publishers/de/braunschweiger_zeitung.py +++ b/src/fundus/publishers/de/braunschweiger_zeitung.py @@ -5,12 +5,14 @@ from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( apply_substitution_pattern_over_list, extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -64,3 +66,14 @@ def authors(self) -> List[str]: @attribute def publishing_date(self) -> Optional[datetime.datetime]: return generic_date_parsing(self.precomputed.ld.bf_search("datePublished")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=XPath( + "//div[@class='article-body']//p[not(not(text()) or @rel='author' or em[@class='print'])]" + ), + image_selector=XPath("//img[not(contains(@class, 'rounded-full'))]"), + author_selector=re.compile(r"©(?P<credits>.*)"), + ) diff --git a/src/fundus/publishers/de/business_insider_de.py b/src/fundus/publishers/de/business_insider_de.py index fe0c2a26d..14b7a334a 100644 --- a/src/fundus/publishers/de/business_insider_de.py +++ b/src/fundus/publishers/de/business_insider_de.py @@ -5,11 +5,13 @@ from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -56,3 +58,11 @@ def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) or generic_topic_parsing( self.precomputed.ld.bf_search("keywords") ) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, +
paragraph_selector=self._paragraph_selector, + image_selector=XPath("//img[not(contains(@class, 'size-thumbnail-square'))]"), + ) diff --git a/src/fundus/publishers/de/die_welt.py b/src/fundus/publishers/de/die_welt.py index aa10ee7e9..b922baf04 100644 --- a/src/fundus/publishers/de/die_welt.py +++ b/src/fundus/publishers/de/die_welt.py @@ -5,13 +5,14 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( apply_substitution_pattern_over_list, extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -51,6 +52,18 @@ def title(self) -> Optional[str]: def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=CSSSelector("figure:not(.c-inline-video) img"), + caption_selector=XPath("./ancestor::figure//span[@class='c-content-image__caption-alt']"), + author_selector=XPath("./ancestor::figure//span[@class='c-content-image__caption-source']"), + lower_boundary_selector=XPath("//section[@class='c-attached-content']"), + size_pattern=re.compile(r"-w(?P<width>[0-9]+)/"), + ) + class V1_1(V1): VALID_UNTIL = datetime.date.today() diff --git a/src/fundus/publishers/de/die_zeit.py b/src/fundus/publishers/de/die_zeit.py index 772e37e6d..c1fdf64ed 100644 --- a/src/fundus/publishers/de/die_zeit.py +++ b/src/fundus/publishers/de/die_zeit.py @@ -5,13 +5,14 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility
import ( apply_substitution_pattern_over_list, extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -48,3 +49,14 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//figure//img[@class='article__media-item']"), + caption_selector=XPath("./ancestor::figure//span[@class='figure__text']"), + author_selector=XPath("./ancestor::figure//span[@class='figure__copyright']"), + lower_boundary_selector=XPath("//nav[@class='breadcrumbs']"), + ) diff --git a/src/fundus/publishers/de/dw.py b/src/fundus/publishers/de/dw.py index fa4a1ab4e..907ce83a3 100644 --- a/src/fundus/publishers/de/dw.py +++ b/src/fundus/publishers/de/dw.py @@ -5,7 +5,7 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( apply_substitution_pattern_over_list, extract_article_body_with_selector, @@ -13,6 +13,7 @@ generic_date_parsing, generic_text_extraction_with_css, generic_topic_parsing, + image_extraction, strip_nodes_to_text, ) @@ -60,6 +61,9 @@ def title(self) -> Optional[str]: def topics(self) -> List[str]: return [node.text_content().strip() for node in self._topic_selector(self.precomputed.doc)] + # As of now, images cannot reliably be implemented for DW, since all pictures in the article, are loaded + # dynamically with URLs like 'https://static.dw.com/image/65166768_${formatId}.jpg' + class V2_1(V2): VALID_UNTIL = datetime.date.today() diff --git a/src/fundus/publishers/de/faz.py b/src/fundus/publishers/de/faz.py index 84043b687..71921f8ed 100644 
--- a/src/fundus/publishers/de/faz.py +++ b/src/fundus/publishers/de/faz.py @@ -4,12 +4,13 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, parse_title_from_root, strip_nodes_to_text, ) @@ -57,6 +58,8 @@ def authors(self) -> List[str]: def title(self) -> Optional[str]: return self.precomputed.meta.get("og:title") + # As of now, images can't be implemented for FAZ, since they are not crawled by CC-Bot + class V2(BaseParser): _summary_selector = CSSSelector("div.header-teaser") _paragraph_selector = CSSSelector(".body-elements__paragraph") @@ -95,3 +98,13 @@ def authors(self) -> List[str]: @attribute def title(self) -> Optional[str]: return self.precomputed.meta.get("og:title") or parse_title_from_root(self.precomputed.doc) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//figure//img|//picture//img"), + caption_selector=XPath("./ancestor::figure//span"), + author_selector=XPath("./ancestor::figure//em"), + ) diff --git a/src/fundus/publishers/de/focus.py b/src/fundus/publishers/de/focus.py index 4908669fa..f10ae8821 100644 --- a/src/fundus/publishers/de/focus.py +++ b/src/fundus/publishers/de/focus.py @@ -5,11 +5,12 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -65,3 +66,20 @@ def 
topics(self) -> List[str]: topic_names: List[str] = re.findall(self._topic_name_pattern, match.group(1)) return topic_names + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//div[@class='image clearfix']//img|//figure//img"), + caption_selector=XPath( + "./ancestor::div[@class='image clearfix']//span[@class='caption']|" + "./ancestor::figure//span[@class='Image-Caption']" + ), + author_selector=XPath( + "./ancestor::div[@class='image clearfix']//span[@class='source']|" + "./ancestor::figure//span[@class='Image-Credit']" + ), + lower_boundary_selector=XPath("//footer"), + ) diff --git a/src/fundus/publishers/de/frankfurter_rundschau.py b/src/fundus/publishers/de/frankfurter_rundschau.py index 98d5901bd..815ac24f0 100644 --- a/src/fundus/publishers/de/frankfurter_rundschau.py +++ b/src/fundus/publishers/de/frankfurter_rundschau.py @@ -1,14 +1,16 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -44,3 +46,12 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def title(self) -> Optional[str]: return self.precomputed.meta.get("og:title") + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=CSSSelector("article"), + author_selector=re.compile(r"©(?P<credits>.+)"), + ) diff --git a/src/fundus/publishers/de/freiepresse.py b/src/fundus/publishers/de/freiepresse.py index b830d2b8f..38665406b 100644 ---
a/src/fundus/publishers/de/freiepresse.py +++ b/src/fundus/publishers/de/freiepresse.py @@ -1,21 +1,27 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) class FreiePresseParser(ParserProxy): class V1(BaseParser): + VALID_UNTIL = datetime.date(2024, 8, 4) _summary_selector = CSSSelector("#artikel-content > p.bold") - _paragraph_selector = CSSSelector("#artikel-content p:not(.bold)") + _paragraph_selector = XPath( + "//*[@id='artikel-content']//p[not(ancestor::div[@class='pw-layer'] or @class='bold')]" + ) _subheadline_selector = CSSSelector("#artikel-content h2") @attribute @@ -33,12 +39,53 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def authors(self) -> List[str]: - return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author")) + if not (authors := self.precomputed.ld.xpath_search("NewsArticle/author")): + return [] + else: + return generic_author_parsing( + [author for author in authors if not author == "Chemnitzer Verlag und Druck GmbH & Co. 
KG"] + ) @attribute def title(self) -> Optional[str]: - return self.precomputed.meta.get("og:title") + if title := self.precomputed.meta.get("og:title"): + return re.sub(r"\s*\|.*", "", title) + return None @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.ld.bf_search("keywords"), delimiter="/") + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath( + "((//div[contains(@class,'wrapImg')]//picture)[1])//img | //img[@class='media-image']" + ), + lower_boundary_selector=XPath("//div[@class='section-topic']"), + caption_selector=XPath("./ancestor::li[@class='img gallery-item']//span[@class='img-info']"), + author_selector=re.compile(r"(?i)bild:(?P.*)"), + relative_urls=True, + ) + + class V1_1(V1): + VALID_UNTIL = datetime.date.today() + _paragraph_selector = CSSSelector("#artikel-content p:not(.bold)") + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath( + "//div[@class='detail-img__image-wrapper detail-img__image-wrapper--gradient']//img" + ), + lower_boundary_selector=CSSSelector("a.article__copyright"), + caption_selector=XPath( + "./ancestor::div[@class='detail-img']//div[@class='detail-img__description no-transition']/div/text()" + ), + author_selector=re.compile(r"(?i)bild:(?P.*)"), + relative_urls=True, + ) diff --git a/src/fundus/publishers/de/gamestar.py b/src/fundus/publishers/de/gamestar.py index 22ed99130..31ad6990b 100644 --- a/src/fundus/publishers/de/gamestar.py +++ b/src/fundus/publishers/de/gamestar.py @@ -1,13 +1,16 @@ +import re from datetime import datetime from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, 
BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -37,3 +40,16 @@ def authors(self) -> List[str]: @attribute def publishing_date(self) -> Optional[datetime]: return generic_date_parsing(self.precomputed.ld.bf_search("datePublished")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//div[@class='main waypoint']"), + image_selector=XPath("//picture/img"), + caption_selector=XPath("./ancestor::p[@class='caption ']/span[@class='bu m-t-1']"), + lower_boundary_selector=XPath("//div[@id='comments']"), + author_selector=re.compile("(?i)Bildquelle:(?P.*)"), + relative_urls=True, + ) diff --git a/src/fundus/publishers/de/golem.py b/src/fundus/publishers/de/golem.py index 3571610f6..bd1b8c8dd 100644 --- a/src/fundus/publishers/de/golem.py +++ b/src/fundus/publishers/de/golem.py @@ -1,15 +1,17 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -50,3 +52,12 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("news_keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//article"), + author_selector=re.compile(r"(?i)\(bild:(?P.*)\)"), + ) diff --git 
a/src/fundus/publishers/de/hamburger_abendblatt.py b/src/fundus/publishers/de/hamburger_abendblatt.py index 4b8009655..29d7d734e 100644 --- a/src/fundus/publishers/de/hamburger_abendblatt.py +++ b/src/fundus/publishers/de/hamburger_abendblatt.py @@ -6,11 +6,13 @@ from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -51,3 +53,14 @@ def topics(self) -> List[str]: re.sub(r"\s*–.+", "", node.text_content()).strip() for node in self._topics_selector(self.precomputed.doc) ] + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=XPath( + "//div[@class='article-body']//p[not(not(text()) or @rel='author' or em[@class='print'])]" + ), + image_selector=XPath("//img[not(contains(@class, 'rounded-full'))]"), + author_selector=re.compile(r"©(?P.*)"), + ) diff --git a/src/fundus/publishers/de/heise.py b/src/fundus/publishers/de/heise.py index 5adc2f4e0..ef1155156 100644 --- a/src/fundus/publishers/de/heise.py +++ b/src/fundus/publishers/de/heise.py @@ -1,14 +1,16 @@ +import re from datetime import datetime from typing import List, Optional from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -61,3 +63,30 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + + @attribute + def images(self) -> List[Image]: + # There some (rare) cases there are some images that are not being 
extracted with this, because they are + # referenced by relative URLs. e.g. + # https://www.heise.de/hintergrund/Zahlen-bitte-136199-Eris-Der-Grund-warum-Pluto-kein-Planet-mehr-ist-9993800.html + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath( + "//h1[@class='article-headline ' or contains(@class, 'a-article-header__title')]" + ), + image_selector=XPath( + "//div[@class='article-image__gallery-container']//img|" + "//div[@class='image-container']//img|" + "//div[@class='article-layout__content']//figure[not(@class)]//noscript//img" + ), + caption_selector=XPath( + "./ancestor::figure//p[@class='a-caption__text']|" + "./ancestor::figure//div[@class='text']|" + "./ancestor::div[@class='article-gallery ']//span[@class='caption']" + ), + author_selector=XPath( + "./ancestor::figure//p[@class='a-caption__source']|" + "./ancestor::div[@class='article-gallery ']//span[@class='copyright']" + ), + ) diff --git a/src/fundus/publishers/de/hessenschau.py b/src/fundus/publishers/de/hessenschau.py index e8b4b5a88..91b23e66d 100644 --- a/src/fundus/publishers/de/hessenschau.py +++ b/src/fundus/publishers/de/hessenschau.py @@ -1,15 +1,17 @@ +import re from datetime import datetime from typing import List, Optional from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -51,3 +53,13 @@ def authors(self) -> List[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("news_keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + 
paragraph_selector=self._paragraph_selector, + image_selector=XPath("//figure[not(@class='ar-1-1')]//*[not(self::noscript)]/img"), + caption_selector=XPath("./ancestor::figure//span[@class='pr-3']"), + author_selector=XPath("./ancestor::figure//span[@class='text-gray-scorpion dark:text-text-dark']"), + ) diff --git a/src/fundus/publishers/de/junge_welt.py b/src/fundus/publishers/de/junge_welt.py index 43261092b..586268101 100644 --- a/src/fundus/publishers/de/junge_welt.py +++ b/src/fundus/publishers/de/junge_welt.py @@ -4,12 +4,13 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -50,3 +51,12 @@ def free_access(self) -> bool: @attribute def publishing_date(self) -> Optional[datetime.datetime]: return generic_date_parsing(self.precomputed.ld.bf_search("datePublished")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + caption_selector=XPath("./ancestor::figure//div[contains(@class, 'caption')]"), + relative_urls=True, + ) diff --git a/src/fundus/publishers/de/kicker.py b/src/fundus/publishers/de/kicker.py index a88c4a8a2..dcaad0106 100644 --- a/src/fundus/publishers/de/kicker.py +++ b/src/fundus/publishers/de/kicker.py @@ -2,12 +2,14 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + 
image_extraction, ) @@ -37,3 +39,16 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def title(self) -> Optional[str]: return self.precomputed.meta.get("og:title") + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//article"), + image_selector=XPath( + "//*[contains(@class,'kick__article__picture') and not(contains(@class, 'medias'))]//img" + ), + caption_selector=XPath("./ancestor::*[contains(@class, 'kick__article__picture ')]//p/text()"), + author_selector=XPath("./ancestor::*[contains(@class, 'kick__article__picture ')]//p/span"), + ) diff --git a/src/fundus/publishers/de/krautreporter.py b/src/fundus/publishers/de/krautreporter.py index 292ad340f..f46e21ac4 100644 --- a/src/fundus/publishers/de/krautreporter.py +++ b/src/fundus/publishers/de/krautreporter.py @@ -4,7 +4,15 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute, utility +from fundus.parser import ( + ArticleBody, + BaseParser, + Image, + ParserProxy, + attribute, + utility, +) +from fundus.parser.utility import image_extraction class KrautreporterParser(ParserProxy): @@ -49,3 +57,19 @@ def publishing_date(self) -> Optional[datetime]: @attribute def topics(self) -> List[str]: return utility.generic_topic_parsing(self._topic_selector(self.precomputed.doc)) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath( + "//section[@class='article-headers-shared-teaser-image']//img|" + "//figure[contains(@class, 'image--default')]//img" + ), + author_selector=XPath( + "./ancestor::section[@class='article-headers-shared-teaser-image']" + "//p[@class='article-headers-shared-teaser-image__credits']" + ), + relative_urls=True, + ) diff --git 
a/src/fundus/publishers/de/mdr.py b/src/fundus/publishers/de/mdr.py index 79d762c3f..579864515 100644 --- a/src/fundus/publishers/de/mdr.py +++ b/src/fundus/publishers/de/mdr.py @@ -5,13 +5,14 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( apply_substitution_pattern_over_list, extract_article_body_with_selector, generic_date_parsing, generic_text_extraction_with_css, generic_topic_parsing, + image_extraction, ) @@ -61,3 +62,14 @@ def authors(self) -> List[str]: @attribute def title(self) -> Optional[str]: return title if isinstance(title := self.precomputed.ld.bf_search("headline"), str) else None + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//div[@id='content']"), + image_selector=XPath("//div[contains(@class,'mediaCon ') and not(@data-ctrl-player)]//noscript/img"), + caption_selector=XPath("./ancestor::div[@class='media mediaA ']//span[@class='mediaSubtitle']"), + author_selector=XPath("./ancestor::div[@class='media mediaA ']//span[@class='mediaRights copyright']"), + ) diff --git a/src/fundus/publishers/de/merkur.py b/src/fundus/publishers/de/merkur.py index 8eb9e9da9..304ae9985 100644 --- a/src/fundus/publishers/de/merkur.py +++ b/src/fundus/publishers/de/merkur.py @@ -1,14 +1,16 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -38,3 
+40,13 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def title(self) -> Optional[str]: return self.precomputed.meta.get("og:title") + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//article"), + image_selector=XPath("//figure[@class='id-StoryElement-image']//img"), + author_selector=re.compile(r"©(?P.+)"), + ) diff --git a/src/fundus/publishers/de/morgenpost_berlin.py b/src/fundus/publishers/de/morgenpost_berlin.py index f33cf71ab..56ebfed2d 100644 --- a/src/fundus/publishers/de/morgenpost_berlin.py +++ b/src/fundus/publishers/de/morgenpost_berlin.py @@ -6,11 +6,13 @@ from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -51,3 +53,14 @@ def topics(self) -> List[str]: re.sub(r"\s*–.+", "", node.text_content()).strip() for node in self._topics_selector(self.precomputed.doc) ] + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=XPath( + "//div[@class='article-body']//p[not(not(text()) or @rel='author' or em[@class='print'])]" + ), + image_selector=XPath("//img[not(contains(@class, 'rounded-full'))]"), + author_selector=re.compile(r"©(?P.*)"), + ) diff --git a/src/fundus/publishers/de/motorsport_magazin.py b/src/fundus/publishers/de/motorsport_magazin.py index d50697f90..35340feeb 100644 --- a/src/fundus/publishers/de/motorsport_magazin.py +++ b/src/fundus/publishers/de/motorsport_magazin.py @@ -1,14 +1,17 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, 
ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -45,3 +48,14 @@ def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("news_keywords")) else: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//*[not(self::figure)]/picture//img"), + caption_selector=XPath("(./ancestor::picture/following-sibling::figcaption)[1]"), + author_selector=re.compile(r"(?i),?\s*foto:(?P.+)"), + relative_urls=True, + ) diff --git a/src/fundus/publishers/de/mz.py b/src/fundus/publishers/de/mz.py index c925172f1..219ebd014 100644 --- a/src/fundus/publishers/de/mz.py +++ b/src/fundus/publishers/de/mz.py @@ -2,13 +2,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -42,3 +44,16 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + caption_selector=XPath( + "./ancestor::div[@class='key-visual-image-wrapper']//span[@data-fp-flag='main-image-caption']" + ), + author_selector=XPath( + 
"./ancestor::div[@class='key-visual-image-wrapper']//span[@data-fp-flag='main-image-source']" + ), + ) diff --git a/src/fundus/publishers/de/ndr.py b/src/fundus/publishers/de/ndr.py index 08a73fdee..1845bb6a2 100644 --- a/src/fundus/publishers/de/ndr.py +++ b/src/fundus/publishers/de/ndr.py @@ -1,15 +1,17 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -46,3 +48,17 @@ def authors(self) -> List[str]: @attribute def title(self) -> Optional[str]: return self.precomputed.meta.get("title") + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//div[@id='page']"), + image_selector=XPath( + "//div[@id='page']//*[(self::div and not(@class='teaserimage')) or (self::a and @class='zoomimage')]/div[contains(@class,'image-container')]//picture//img" + ), + relative_urls=XPath("string(//link[@rel='canonical']/@href)"), + caption_selector=XPath("./ancestor::div[contains(@class,'contentimage')]//span[@class='caption']"), + author_selector=re.compile(r"(?i)©\s*(ndr)?\s*(foto)?:?\s*(?P.+)"), + ) diff --git a/src/fundus/publishers/de/netzpolitik_org.py b/src/fundus/publishers/de/netzpolitik_org.py index c670b74f2..b49c9f04c 100644 --- a/src/fundus/publishers/de/netzpolitik_org.py +++ b/src/fundus/publishers/de/netzpolitik_org.py @@ -1,13 +1,16 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from 
fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_date_parsing, generic_topic_parsing, + image_extraction, parse_title_from_root, strip_nodes_to_text, ) @@ -48,3 +51,12 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def authors(self) -> List[str]: return [node.text_content() for node in (self._author_selector(self.precomputed.doc) or [])] + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + caption_selector=XPath("./ancestor::figure//figcaption/text()"), + author_selector=XPath("./ancestor::figure//figcaption/span"), + ) diff --git a/src/fundus/publishers/de/ntv.py b/src/fundus/publishers/de/ntv.py index 28aad5478..deef8f9c2 100644 --- a/src/fundus/publishers/de/ntv.py +++ b/src/fundus/publishers/de/ntv.py @@ -5,13 +5,14 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( apply_substitution_pattern_over_list, extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -51,6 +52,17 @@ def title(self) -> Optional[str]: def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//figure[not(contains(@class, 'teaser'))]//picture/img"), + upper_boundary_selector=XPath("//article[@class='article']"), + caption_selector=XPath("./ancestor::figure//figcaption/p[@class='article__caption']"), + 
author_selector=XPath("./ancestor::figure//figcaption/p[@class='article__credit']"), + ) + class V1_1(V1): VALID_UNTIL = datetime.date.today() _author_selector = XPath("string(//span[@class='article__author'])") diff --git a/src/fundus/publishers/de/postillon.py b/src/fundus/publishers/de/postillon.py index 210656b49..3a84d4f37 100644 --- a/src/fundus/publishers/de/postillon.py +++ b/src/fundus/publishers/de/postillon.py @@ -2,11 +2,13 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_date_parsing, + image_extraction, ) @@ -37,3 +39,11 @@ def authors(self) -> List[str]: @attribute def publishing_date(self) -> Optional[datetime.datetime]: return generic_date_parsing(self.precomputed.ld.bf_search("datePublished")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//div[@class='entry-content-wrap flex-col']//img"), + ) diff --git a/src/fundus/publishers/de/rheinische_post.py b/src/fundus/publishers/de/rheinische_post.py index 48ebdc4be..fb95f2960 100644 --- a/src/fundus/publishers/de/rheinische_post.py +++ b/src/fundus/publishers/de/rheinische_post.py @@ -2,13 +2,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -42,3 +44,13 @@ def title(self) -> Optional[str]: @attribute def 
topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//figure[@id]//img[not(@alt='Platzhalter Drittanbieter-Inhalt')]"), + caption_selector=XPath("./ancestor::figure//figcaption/p"), + relative_urls=True, + ) diff --git a/src/fundus/publishers/de/rn.py b/src/fundus/publishers/de/rn.py index 0f3322a85..3d92d9c3c 100644 --- a/src/fundus/publishers/de/rn.py +++ b/src/fundus/publishers/de/rn.py @@ -2,13 +2,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -42,3 +44,13 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + image_selector=XPath("//figure[not(@class='teaser__thumbnail')]//img"), + paragraph_selector=self._paragraph_selector, + caption_selector=XPath("./ancestor::figure//figcaption/text()"), + author_selector=XPath("./ancestor::figure//figcaption/span"), + ) diff --git a/src/fundus/publishers/de/spon.py b/src/fundus/publishers/de/spon.py index 3643eeedc..18db11fca 100644 --- a/src/fundus/publishers/de/spon.py +++ b/src/fundus/publishers/de/spon.py @@ -2,13 +2,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import 
ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -42,3 +44,19 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("news_keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + lower_boundary_selector=XPath("//footer"), + image_selector=XPath("//figure//picture//img"), + caption_selector=XPath( + "./ancestor::figure/following-sibling::figcaption[1]//p|" "./ancestor::figure/figcaption[1]//p" + ), + author_selector=XPath( + "./ancestor::figure/following-sibling::figcaption[1]/span|" + "./ancestor::figure/figcaption[1]/*[(self::span or self::div) and contains(@class,'Credit')]" + ), + ) diff --git a/src/fundus/publishers/de/sportschau.py b/src/fundus/publishers/de/sportschau.py index 82d9bca41..adb5d199b 100644 --- a/src/fundus/publishers/de/sportschau.py +++ b/src/fundus/publishers/de/sportschau.py @@ -1,14 +1,17 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -48,3 +51,18 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + 
image_selector=XPath("//article//picture[not(contains(@class,'--list'))]//img"), + lower_boundary_selector=XPath("//div[contains(@class, 'back-to-top')]"), + alt_selector=XPath("./@title"), + author_selector=re.compile(r"\|(?P.+)"), + caption_selector=XPath( + "./ancestor::div[contains(@class, 'absatzbild ')]/div[@class='absatzbild__info']" + ), + size_pattern=re.compile(r"/[\dx]+-(?P[0-9]+)/"), + ) diff --git a/src/fundus/publishers/de/stern.py b/src/fundus/publishers/de/stern.py index 68c613b05..c1b3cb638 100644 --- a/src/fundus/publishers/de/stern.py +++ b/src/fundus/publishers/de/stern.py @@ -4,13 +4,14 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_nodes_to_text, generic_topic_parsing, + image_extraction, ) @@ -53,6 +54,17 @@ def topics(self) -> List[str]: topic_nodes = self.precomputed.doc.cssselect(".article__tags li.links__item") return [node.text_content().strip("\n ") for node in topic_nodes] + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + image_selector=XPath("//figure[not(contains(@class, 'teaser'))]//img"), + paragraph_selector=self._paragraph_selector, + lower_boundary_selector=CSSSelector(".article__tags li.links__item"), + caption_selector=XPath("./ancestor::figure//figcaption//div[contains(@class,'caption')]"), + author_selector=XPath("./ancestor::figure//figcaption//div[contains(@class,'credits')]"), + ) + class V2(BaseParser): _paragraph_selector = CSSSelector(".article__body > .text-element > p.is-initial") _summary_selector = CSSSelector(".article__body > .intro") @@ -87,3 +99,14 @@ def topics(self) -> List[str]: return generic_topic_parsing( 
generic_nodes_to_text(self._topic_selector(self.precomputed.doc), normalize=True) ) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + image_selector=XPath("//figure[not(contains(@class, 'teaser'))]//img"), + paragraph_selector=self._paragraph_selector, + lower_boundary_selector=self._topic_selector, + caption_selector=XPath("./ancestor::figure//figcaption//div[contains(@class,'caption')]"), + author_selector=XPath("./ancestor::figure//figcaption//div[contains(@class,'credits')]"), + ) diff --git a/src/fundus/publishers/de/sz.py b/src/fundus/publishers/de/sz.py index 2f55a7ce2..68872c052 100644 --- a/src/fundus/publishers/de/sz.py +++ b/src/fundus/publishers/de/sz.py @@ -1,15 +1,17 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -45,6 +47,15 @@ def title(self) -> Optional[str]: def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + caption_selector=XPath("./ancestor::figure//figcaption/text()"), + author_selector=XPath("./ancestor::figure//figcaption/small"), + ) + class V1_1(V1): VALID_UNTIL = datetime.date.today() _paragraph_selector = XPath( diff --git a/src/fundus/publishers/de/tagesschau.py b/src/fundus/publishers/de/tagesschau.py index b56c95137..75716a914 100644 --- a/src/fundus/publishers/de/tagesschau.py +++ b/src/fundus/publishers/de/tagesschau.py @@ -5,11 +5,12 @@ from lxml.cssselect import CSSSelector 
from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -50,3 +51,17 @@ def title(self) -> Optional[str]: def topics(self) -> List[str]: topic_nodes = self._topic_selector(self.precomputed.doc) return [node.text_content() for node in topic_nodes] + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath( + "//*[not(self::div and @class='teaser-absatz__image')]/div[@class='ts-picture__wrapper']//img" + ), + alt_selector=XPath("./@title"), + author_selector=re.compile(r"\|(?P<credits>.+)"), + caption_selector=XPath("./ancestor::div[contains(@class, 'absatzbild ')]"), + lower_boundary_selector=self._topic_selector, + ) diff --git a/src/fundus/publishers/de/tagesspiegel.py b/src/fundus/publishers/de/tagesspiegel.py index 8ef19bf2d..977d4ef9e 100644 --- a/src/fundus/publishers/de/tagesspiegel.py +++ b/src/fundus/publishers/de/tagesspiegel.py @@ -2,13 +2,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -43,3 +45,12 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, +
paragraph_selector=self._paragraph_selector, + relative_urls=True, + author_selector=XPath("./ancestor::figure//p[contains(@class, 'tspQcl')]"), + ) diff --git a/src/fundus/publishers/de/taz.py b/src/fundus/publishers/de/taz.py index 3257a77e0..7261d2ad2 100644 --- a/src/fundus/publishers/de/taz.py +++ b/src/fundus/publishers/de/taz.py @@ -2,13 +2,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -73,3 +75,13 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("taz:tag")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//div[@data-for='picture']//img[not(@class)]"), + caption_selector=XPath("./ancestor::div[@data-for='picture']/figcaption/text()"), + author_selector=XPath("./ancestor::div[@data-for='picture']/figcaption/span"), + ) diff --git a/src/fundus/publishers/de/vogue_de.py b/src/fundus/publishers/de/vogue_de.py index 24f9b2172..b0fad64c9 100644 --- a/src/fundus/publishers/de/vogue_de.py +++ b/src/fundus/publishers/de/vogue_de.py @@ -4,12 +4,13 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -43,3 +44,15 @@ def 
title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath( + "//article//*[not(self::a)]/picture[not(contains(@class, 'summary-item__image'))]//img" + ), + caption_selector=XPath("./ancestor::figure//span[contains(@class, 'caption__text')]"), + author_selector=XPath("./ancestor::figure//span[contains(@class, 'caption__credit')]"), + ) diff --git a/src/fundus/publishers/de/waz.py b/src/fundus/publishers/de/waz.py index b06376dbb..7195a29c2 100644 --- a/src/fundus/publishers/de/waz.py +++ b/src/fundus/publishers/de/waz.py @@ -5,12 +5,13 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -55,6 +56,16 @@ def topics(self) -> List[str]: for node in self._topics_selector(self.precomputed.doc) ] + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + lower_boundary_selector=XPath("//a[@href='/' and contains(text(), 'Startseite')]"), + caption_selector=XPath("(./ancestor::figure//figcaption//span)[1]"), + author_selector=XPath("(./ancestor::figure//figcaption//span)[2]"), + ) + class V1_1(V1): VALID_UNTIL = datetime.date.today() diff --git a/src/fundus/publishers/de/wdr.py b/src/fundus/publishers/de/wdr.py index b6885c4a9..f718a3bf0 100644 --- a/src/fundus/publishers/de/wdr.py +++ b/src/fundus/publishers/de/wdr.py @@ -1,14 +1,16 @@ import datetime +import re from typing import List, Optional 
from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -44,3 +46,19 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("Keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath( + "//article//picture[not(@data-resp-img-id='LinklistenteaserImageSectionZModA')]//img[@class='img']" + ), + upper_boundary_selector=XPath("//div[@class='segment']"), + lower_boundary_selector=XPath("//div[@class='shareCon']"), + alt_selector=XPath("./@title"), + author_selector=re.compile(r"(?i)\|\s*bildquelle:(?P<credits>.+)"), + relative_urls=True, + caption_selector=XPath("./ancestor::div[@class='media mediaA']//p[@class='infotext']"), + ) diff --git a/src/fundus/publishers/de/winfuture.py b/src/fundus/publishers/de/winfuture.py index 10566a394..72dd70f07 100644 --- a/src/fundus/publishers/de/winfuture.py +++ b/src/fundus/publishers/de/winfuture.py @@ -5,12 +5,13 @@ from lxml.etree import XPath from lxml.html import HtmlElement, fromstring, tostring -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -59,3 +60,17 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("article:tag")) + + @attribute + def images(self) ->
List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//div[@class='primary_content']//img[@class='teaser_img' or @class='photo']"), + upper_boundary_selector=XPath("//div[@class='primary_content']"), + lower_boundary_selector=XPath("//div[@class='mb20 more_links']"), + caption_selector=XPath("./ancestor::span[contains(@class,'hmedia')]//a"), + author_selector=XPath( + "./ancestor::div[@class='teaser_img_container']//div[@class='teaser_img_source']" + ), + ) diff --git a/src/fundus/publishers/de/zdf.py b/src/fundus/publishers/de/zdf.py index 246a7c0a8..8d63b1a18 100644 --- a/src/fundus/publishers/de/zdf.py +++ b/src/fundus/publishers/de/zdf.py @@ -2,18 +2,20 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) class ZDFParser(ParserProxy): class V1(BaseParser): - _div_selector = CSSSelector("div.r1nj4qn5") + _paragraph_selector = CSSSelector("div.r1nj4qn5") _summary_selector = CSSSelector("p.ikh9v7p.c1bdz7f4") _subheadlines_selector = CSSSelector("h2.t1rbo974.hhhtovw") @@ -21,7 +23,7 @@ class V1(BaseParser): def body(self) -> Optional[ArticleBody]: return extract_article_body_with_selector( self.precomputed.doc, - paragraph_selector=self._div_selector, + paragraph_selector=self._paragraph_selector, summary_selector=self._summary_selector, subheadline_selector=self._subheadlines_selector, ) @@ -37,3 +39,15 @@ def authors(self) -> List[str]: @attribute def publishing_date(self) -> Optional[datetime.datetime]: return generic_date_parsing(self.precomputed.ld.bf_search("datePublished")) + + @attribute + def images(self) -> List[Image]: + 
return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath( + "//picture//img[not(contains(@class, 'error') or contains(@src, 'zdfheute-whatsapp-channel'))]" + ), + caption_selector=XPath("./ancestor::div[@class='c1owvrps c10o8fzf']//span[@class='c1pbsmr2']"), + lower_boundary_selector=XPath("//div[@class='s1am5zo f1uhhdhr']"), + ) diff --git a/src/fundus/publishers/es/el_pais.py b/src/fundus/publishers/es/el_pais.py index 60c4781b7..8e12d4d93 100644 --- a/src/fundus/publishers/es/el_pais.py +++ b/src/fundus/publishers/es/el_pais.py @@ -1,14 +1,16 @@ import datetime from typing import List, Optional +from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -42,3 +44,15 @@ def authors(self) -> List[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//article"), + lower_boundary_selector=CSSSelector("aside.a_com"), + image_selector=CSSSelector("figure.a_m img"), + caption_selector=XPath("./ancestor::figure//figcaption/span[1]"), + author_selector=XPath("./ancestor::figure//figcaption/span[last()]"), + ) diff --git a/src/fundus/publishers/fr/le_figaro.py b/src/fundus/publishers/fr/le_figaro.py index 7244dfd6d..0b2ac840b 100644 --- a/src/fundus/publishers/fr/le_figaro.py +++ b/src/fundus/publishers/fr/le_figaro.py @@ -2,13 +2,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import 
XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -42,3 +44,14 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=CSSSelector("article"), + image_selector=XPath("//figure/descendant::img[1]"), + caption_selector=XPath("./ancestor::figure//figcaption/text()"), + author_selector=XPath("./ancestor::figure//figcaption/span"), + ) diff --git a/src/fundus/publishers/fr/le_monde.py b/src/fundus/publishers/fr/le_monde.py index ac2543717..96f4eacf6 100644 --- a/src/fundus/publishers/fr/le_monde.py +++ b/src/fundus/publishers/fr/le_monde.py @@ -4,11 +4,12 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -44,3 +45,14 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def authors(self) -> List[str]: return generic_author_parsing(self.precomputed.meta.get("og:article:author")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=CSSSelector("article"), + image_selector=XPath("//figure/descendant::img[1]"), + 
caption_selector=XPath("./ancestor::figure//figcaption/text()"), + author_selector=XPath("./ancestor::figure//figcaption/span"), + ) diff --git a/src/fundus/publishers/fr/les_echos.py b/src/fundus/publishers/fr/les_echos.py index 1dfba452d..a250f313f 100644 --- a/src/fundus/publishers/fr/les_echos.py +++ b/src/fundus/publishers/fr/les_echos.py @@ -1,15 +1,17 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_nodes_to_text, + image_extraction, normalize_whitespace, ) @@ -57,3 +59,11 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def authors(self) -> List[str]: return generic_author_parsing(self.precomputed.ld.bf_search("author")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + author_selector=re.compile(r"\((?P<credits>.*?)\)$"), + ) diff --git a/src/fundus/publishers/ind/bhaskar.py b/src/fundus/publishers/ind/bhaskar.py index 15d5a273b..ea90bdfbe 100644 --- a/src/fundus/publishers/ind/bhaskar.py +++ b/src/fundus/publishers/ind/bhaskar.py @@ -4,12 +4,13 @@ from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -45,3 +46,13 @@ def topics(self) -> List[str]: for topic in generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) if not re.search(self._topic_bloat_pattern, topic) ] + +
@attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//article//picture//img"), + upper_boundary_selector=XPath("//article"), + caption_selector=XPath("(./ancestor::div[@class='f3e032cb']/following-sibling::*[1])/span[text()]"), + ) diff --git a/src/fundus/publishers/ind/times_of_india.py b/src/fundus/publishers/ind/times_of_india.py index 759da0b47..f0343eaa7 100644 --- a/src/fundus/publishers/ind/times_of_india.py +++ b/src/fundus/publishers/ind/times_of_india.py @@ -2,16 +2,18 @@ import re from typing import List, Optional +from lxml.cssselect import CSSSelector from lxml.etree import XPath from lxml.html import fromstring, tostring -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( apply_substitution_pattern_over_list, extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -69,3 +71,17 @@ def topics(self) -> List[str]: for topic in generic_topic_parsing(self.precomputed.meta.get("news_keywords")) if "News" not in topic.title() ] + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=CSSSelector("div.contentwrapper.clearfix"), + lower_boundary_selector=CSSSelector("div.authorComment"), + image_selector=CSSSelector("section.leadmedia img"), + caption_selector=XPath( + "./ancestor::section[contains(@class, 'leadmedia')]//div[contains(@class, 'img_cptn')]" + ), + author_selector=re.compile(r"\((?P<credits>.*?)\)$"), + ) diff --git a/src/fundus/publishers/jp/the_japan_news.py b/src/fundus/publishers/jp/the_japan_news.py index 0a4793ba7..65b7390f7 100644 --- a/src/fundus/publishers/jp/the_japan_news.py +++
b/src/fundus/publishers/jp/the_japan_news.py @@ -4,12 +4,13 @@ from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -44,3 +45,12 @@ def topics(self) -> List[str]: re.sub(r"\([0-9]+\)", "", topic).strip() for topic in generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) ] + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + caption_selector=XPath("./ancestor::figure/figcaption/text()"), + author_selector=XPath("./ancestor::figure/figcaption/span"), + ) diff --git a/src/fundus/publishers/jp/thejapannews.py b/src/fundus/publishers/jp/thejapannews.py deleted file mode 100644 index 0a4793ba7..000000000 --- a/src/fundus/publishers/jp/thejapannews.py +++ /dev/null @@ -1,46 +0,0 @@ -import datetime -import re -from typing import List, Optional - -from lxml.etree import XPath - -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute -from fundus.parser.utility import ( - extract_article_body_with_selector, - generic_author_parsing, - generic_date_parsing, - generic_topic_parsing, -) - - -class TheJapanNewsParser(ParserProxy): - class V1(BaseParser): - _subheadline_selector = XPath("//div[@id='p-article-block']/h2") - _paragraph_selector = XPath("//div[@id='p-article-block']//p[not(@class)]") - - @attribute - def body(self) -> Optional[ArticleBody]: - return extract_article_body_with_selector( - self.precomputed.doc, - paragraph_selector=self._paragraph_selector, - subheadline_selector=self._subheadline_selector, - ) - - @attribute - def title(self) -> Optional[str]: - return self.precomputed.meta.get("og:title") - - @attribute - def 
publishing_date(self) -> Optional[datetime.datetime]: - return generic_date_parsing(self.precomputed.meta.get("article:published_time")) - - @attribute - def authors(self) -> List[str]: - return generic_author_parsing(self.precomputed.ld.bf_search("author")) - - @attribute - def topics(self) -> List[str]: - return [ - re.sub(r"\([0-9]+\)", "", topic).strip() - for topic in generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) - ] diff --git a/src/fundus/publishers/jp/yomiuri_shimbun.py b/src/fundus/publishers/jp/yomiuri_shimbun.py index 200c2fce5..b3fa47976 100644 --- a/src/fundus/publishers/jp/yomiuri_shimbun.py +++ b/src/fundus/publishers/jp/yomiuri_shimbun.py @@ -4,12 +4,13 @@ from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -17,7 +18,7 @@ class YomiuriShimbunParser(ParserProxy): class V1(BaseParser): _paragraph_selector = XPath("//div[@class='p-main-contents ']/p") - _topic_selector = XPath("//div[@class='p-related-tags']/ul/li/a") + _topic_selector = XPath("//div[contains(@class,'p-related-tags')]/ul/li/a") @attribute def body(self) -> Optional[ArticleBody]: @@ -41,3 +42,14 @@ def authors(self) -> List[str]: @attribute def topics(self) -> List[str]: return [node.text_content() for node in self._topic_selector(self.precomputed.doc)] + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//div[@class='p-main-contents ']//img"), + upper_boundary_selector=XPath("//article"), + relative_urls=True, + author_selector=re.compile(r"(?P<credits>=.*)"), + ) diff --git a/src/fundus/publishers/lt/lrt.py b/src/fundus/publishers/lt/lrt.py index
4f266eb68..ba74f0c90 100644 --- a/src/fundus/publishers/lt/lrt.py +++ b/src/fundus/publishers/lt/lrt.py @@ -1,15 +1,17 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -48,3 +50,17 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=CSSSelector("article"), + image_selector=CSSSelector("div.media-block img"), + caption_selector=XPath( + "./ancestor::div[contains(@class, 'media-block')]//div[contains(@class, 'description')]" + ), + author_selector=re.compile(r"/\s*(?P<credits>.*).$"), + relative_urls=True, + ) diff --git a/src/fundus/publishers/my/malay_mail.py b/src/fundus/publishers/my/malay_mail.py index 478192045..ac6240a80 100644 --- a/src/fundus/publishers/my/malay_mail.py +++ b/src/fundus/publishers/my/malay_mail.py @@ -1,14 +1,16 @@ import datetime +import re from typing import List, Optional from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -41,3 +43,13 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + +
@attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//div[contains(@class, 'image')]//img"), + caption_selector=XPath("(./ancestor::div[contains(@class, 'image')])[1]//div[@class='image-caption']"), + author_selector=re.compile(r"\s*—\s*(?P<credits>.*)$"), + ) diff --git a/src/fundus/publishers/na/the_namibian.py b/src/fundus/publishers/na/the_namibian.py index 65513167a..1c940f9c1 100644 --- a/src/fundus/publishers/na/the_namibian.py +++ b/src/fundus/publishers/na/the_namibian.py @@ -6,10 +6,12 @@ from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -45,6 +47,14 @@ def title(self) -> Optional[str]: def authors(self) -> List[str]: return generic_author_parsing(self.precomputed.ld.xpath_search(self._author_selector)) + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//h1[@class='tdb-title-text']"), + ) + class V1_1(V1): VALID_UNTIL = datetime.today().date() _paragraph_selector = XPath("//div[contains(@class, 'entry-content')]/p[(text() or strong) and position()>1]") @@ -59,3 +69,11 @@ def body(self) -> Optional[ArticleBody]: paragraph_selector=self._paragraph_selector, summary_selector=self._summary_selector, ) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//main"), + ) diff --git a/src/fundus/publishers/no/dagbladet.py b/src/fundus/publishers/no/dagbladet.py index f792cb93c..b13c79217 100644 --- a/src/fundus/publishers/no/dagbladet.py +++
b/src/fundus/publishers/no/dagbladet.py @@ -1,14 +1,18 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( extract_article_body_with_selector, generic_date_parsing, generic_nodes_to_text, generic_topic_parsing, + image_extraction, ) @@ -44,3 +48,12 @@ def authors(self) -> List[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("article:tag")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + author_selector=re.compile(r"Foto:(?P<credits>.*)"), + image_selector=XPath("//figure[contains(@class, 'image')]//img"), + ) diff --git a/src/fundus/publishers/no/nettavisen.py b/src/fundus/publishers/no/nettavisen.py index ba8104914..19b85998c 100644 --- a/src/fundus/publishers/no/nettavisen.py +++ b/src/fundus/publishers/no/nettavisen.py @@ -1,15 +1,18 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -48,3 +51,17 @@ def authors(self) -> List[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("article:tag")) + + @attribute + def images(self) -> List[Image]: + author_pattern = r"(Foto:\s*).*$" + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//brick-image-v3 | //img"), + caption_selector=XPath("./ancestor::div[contains(@class,
'image')]//span[1]"), + author_selector=XPath( + f"re:match(./ancestor::div[contains(@class, 'image')]//span[2], '{author_pattern}')", + namespaces={"re": "http://exslt.org/regular-expressions"}, + ), + ) diff --git a/src/fundus/publishers/no/nrk.py b/src/fundus/publishers/no/nrk.py index b99ef98f6..d4d500958 100644 --- a/src/fundus/publishers/no/nrk.py +++ b/src/fundus/publishers/no/nrk.py @@ -2,12 +2,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -37,3 +40,13 @@ def publishing_date(self) -> Optional[datetime.datetime]: @attribute def authors(self) -> List[str]: return generic_author_parsing(self.precomputed.ld.bf_search("author")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + author_selector=XPath("./ancestor::figure/figcaption/small"), + caption_selector=XPath("./ancestor::figure/figcaption/p"), + upper_boundary_selector=CSSSelector("header.article-header"), + ) diff --git a/src/fundus/publishers/no/verdensgang.py b/src/fundus/publishers/no/verdensgang.py index a14568741..d8690dae7 100644 --- a/src/fundus/publishers/no/verdensgang.py +++ b/src/fundus/publishers/no/verdensgang.py @@ -1,15 +1,18 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector from lxml.etree import XPath from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.data import Image from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -54,3 +57,11 @@ def topics(self) -> List[str]: 
@attribute def free_access(self) -> bool: return not self._paywall_selector(self.precomputed.doc) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + author_selector=re.compile(r"Foto:(?P<credits>.*)"), + ) diff --git a/src/fundus/publishers/shared/euronews.py b/src/fundus/publishers/shared/euronews.py index 91a1e6921..a45d64b4d 100644 --- a/src/fundus/publishers/shared/euronews.py +++ b/src/fundus/publishers/shared/euronews.py @@ -1,9 +1,19 @@ +import re from datetime import datetime from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute, utility +from fundus.parser import ( + ArticleBody, + BaseParser, + Image, + ParserProxy, + attribute, + utility, +) +from fundus.parser.utility import image_extraction class EuronewsParser(ParserProxy): @@ -40,3 +50,23 @@ def publishing_date(self) -> Optional[datetime]: def topics(self) -> List[str]: keyword_string = self.precomputed.meta.get("keywords") return utility.generic_topic_parsing(keyword_string) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath( + "//img[contains(@class, 'c-article-media__img')" " or contains(@class, 'widgetImage__image')]" + ), + caption_selector=XPath( + "./ancestor::div[contains(@class, 'c-article-image-video')]" + "//div[contains(@class, 'c-article-caption__content')]|" + "./ancestor::figure//span[@class='widget__captionText']" + ), + author_selector=XPath( + "./ancestor::div[contains(@class, 'c-article-image-video')]" + "//div[contains(@class, 'c-article-image-copyright')]|" + "./ancestor::figure//span[@class='widget__captionCredit']" + ), + ) diff --git a/src/fundus/publishers/tr/haberturk.py b/src/fundus/publishers/tr/haberturk.py index 114c16758..753dd6562
100644 --- a/src/fundus/publishers/tr/haberturk.py +++ b/src/fundus/publishers/tr/haberturk.py @@ -4,12 +4,13 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -49,3 +50,12 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=CSSSelector("article"), + image_selector=CSSSelector("img[data-zoomable]"), + ) diff --git a/src/fundus/publishers/tr/ntvtr.py b/src/fundus/publishers/tr/ntvtr.py index 1dbca5e58..e4a86f81b 100644 --- a/src/fundus/publishers/tr/ntvtr.py +++ b/src/fundus/publishers/tr/ntvtr.py @@ -2,13 +2,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -40,3 +42,13 @@ def topics(self) -> List[str]: @attribute def authors(self) -> List[str]: return generic_author_parsing(self.precomputed.meta.get("articleAuthor")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=CSSSelector("article, div.category-detail-inner"), + 
lower_boundary_selector=CSSSelector("div.social:last-of-type"), + image_selector=XPath("//div[contains(@class, 'img-wrapper')]//img | //picture /img"), + ) diff --git a/src/fundus/publishers/uk/daily_mail.py b/src/fundus/publishers/uk/daily_mail.py index 8044f36b4..c0a13732b 100644 --- a/src/fundus/publishers/uk/daily_mail.py +++ b/src/fundus/publishers/uk/daily_mail.py @@ -2,13 +2,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -43,3 +45,13 @@ def topics(self) -> List[str]: if topic.casefold() != topic: filtered_topics.append(topic) return filtered_topics + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=CSSSelector("div#content"), + image_selector=CSSSelector("div.mol-img-group img"), + caption_selector=XPath("./ancestor::div[contains(@class, 'mol-img-group')]/p[@class='imageCaption']"), + ) diff --git a/src/fundus/publishers/uk/daily_star.py b/src/fundus/publishers/uk/daily_star.py index dbfc5e10a..14de27cb0 100644 --- a/src/fundus/publishers/uk/daily_star.py +++ b/src/fundus/publishers/uk/daily_star.py @@ -4,12 +4,13 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -41,3 +42,13 @@ def title(self) -> Optional[str]: 
@attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=CSSSelector("figure.in-article-image img"), + caption_selector=XPath("./ancestor::figure//figcaption/span[@class='caption']"), + author_selector=XPath("./ancestor::figure//figcaption/span[@class='credit']"), + ) diff --git a/src/fundus/publishers/uk/evening_standard.py b/src/fundus/publishers/uk/evening_standard.py index 7e07a5a3a..9702138e2 100644 --- a/src/fundus/publishers/uk/evening_standard.py +++ b/src/fundus/publishers/uk/evening_standard.py @@ -2,13 +2,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -42,6 +44,24 @@ def authors(self) -> List[str]: def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//picture[not(ancestor::a)] /img"), + upper_boundary_selector=CSSSelector("article"), + caption_selector=XPath( + "./ancestor::div[count(div)=3 and position() <= 2]/div[2] |" + "./ancestor::div[picture and count(div)=2][1]/div[1]" + ), + author_selector=XPath( + "./ancestor::div[count(div)=3 and position() <= 2]/div[3] |" + "./ancestor::div[picture and count(div)=2][1]/div[2]" + ), + lower_boundary_selector=CSSSelector("div#piano-reg-wall"), + ) + class V1_1(V1): VALID_UNTIL = 
datetime.date.today() _summary_selector = CSSSelector("div.sc-jgyXzG") diff --git a/src/fundus/publishers/uk/express.py b/src/fundus/publishers/uk/express.py index a96e103d3..495f2d9e4 100644 --- a/src/fundus/publishers/uk/express.py +++ b/src/fundus/publishers/uk/express.py @@ -1,14 +1,17 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -45,3 +48,17 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("article:tag")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=CSSSelector("div[role=main] article"), + image_selector=CSSSelector("div.photo img"), + caption_selector=XPath("./ancestor::div[contains(@class, 'photo')]/span[@class='newsCaption']/text()"), + author_selector=XPath( + "./ancestor::div[contains(@class, 'photo')]/span[@class='newsCaption']/span[@class='caption']" + ), + size_pattern=re.compile(r"/(?P<width>[0-9]+)x(?P<height>[0-9]+)?/"), + ) diff --git a/src/fundus/publishers/uk/i_news.py b/src/fundus/publishers/uk/i_news.py index 16a4cdfb9..d69e93fb7 100644 --- a/src/fundus/publishers/uk/i_news.py +++ b/src/fundus/publishers/uk/i_news.py @@ -1,14 +1,16 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( 
extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -41,3 +43,13 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=CSSSelector("div.inews__main"), + image_selector=CSSSelector("figure:has(> figcaption) img"), + author_selector=re.compile(r"\((?P<credits>.*?)\)$"), + ) diff --git a/src/fundus/publishers/uk/metro.py b/src/fundus/publishers/uk/metro.py index be8ce3ac7..bc49f335e 100644 --- a/src/fundus/publishers/uk/metro.py +++ b/src/fundus/publishers/uk/metro.py @@ -1,15 +1,17 @@ import datetime +import re from typing import List, Optional, Union from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -65,6 +67,15 @@ def title(self) -> Optional[str]: def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("article:tag")) + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//article"), + author_selector=re.compile(r"(?P<credits>\([^(]+\)$)"), + ) + class V1_1(V1): VALID_UNTIL = datetime.date.today() _summary_selector = XPath("//article//div[@class='article__content__inner']/p[1]") diff --git a/src/fundus/publishers/uk/the_bbc.py b/src/fundus/publishers/uk/the_bbc.py index 7a2127e9c..8a5ea9a03 100644 --- a/src/fundus/publishers/uk/the_bbc.py +++ 
b/src/fundus/publishers/uk/the_bbc.py @@ -4,11 +4,12 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, normalize_whitespace, ) @@ -52,3 +53,13 @@ def title(self) -> Optional[str]: def topics(self) -> List[str]: topic_nodes = self._topic_selector(self.precomputed.doc) return [normalize_whitespace(node.text_content()) for node in topic_nodes] + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//figure //img[not(@src='/bbcx/grey-placeholder.png')]"), + caption_selector=XPath("./ancestor::figure//figcaption//p"), + author_selector=XPath("./ancestor::figure//span[@role='text']/text()"), + ) diff --git a/src/fundus/publishers/uk/the_guardian.py b/src/fundus/publishers/uk/the_guardian.py index 9806ef5c7..f3c725107 100644 --- a/src/fundus/publishers/uk/the_guardian.py +++ b/src/fundus/publishers/uk/the_guardian.py @@ -2,13 +2,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -40,3 +42,12 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("article:tag")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + 
paragraph_selector=self._paragraph_selector, + caption_selector=XPath("(./ancestor::figure//figcaption)[1]/span[2]"), + author_selector=XPath("(./ancestor::figure//figcaption)[1]/text()"), + ) diff --git a/src/fundus/publishers/uk/the_independent.py b/src/fundus/publishers/uk/the_independent.py index 0571fd3c9..1fe74ef36 100644 --- a/src/fundus/publishers/uk/the_independent.py +++ b/src/fundus/publishers/uk/the_independent.py @@ -1,14 +1,16 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -36,3 +38,13 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=CSSSelector("figure > div > img, div[data-gallery-length] > img"), + upper_boundary_selector=CSSSelector("article"), + author_selector=re.compile(r"(?P<credits>(\([^)]*\)\s?)+$)"), + ) diff --git a/src/fundus/publishers/uk/the_mirror.py b/src/fundus/publishers/uk/the_mirror.py index 6d7a84249..53f029fd7 100644 --- a/src/fundus/publishers/uk/the_mirror.py +++ b/src/fundus/publishers/uk/the_mirror.py @@ -4,12 +4,13 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) 
@@ -50,6 +51,21 @@ def authors(self) -> List[str]: def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=CSSSelector("div.image > img, div.image-container amp-img"), + caption_selector=XPath( + "./ancestor::div[@class='lead-content' or @class='image-container']//figcaption//span[1]" + ), + author_selector=XPath( + "./ancestor::div[@class='lead-content' or @class='image-container']//figcaption//span[2]" + ), + lower_boundary_selector=CSSSelector("reach-viafoura-comments"), + ) + class V1_1(V1): VALID_UNTIL = datetime.date.today() diff --git a/src/fundus/publishers/uk/the_sun.py b/src/fundus/publishers/uk/the_sun.py index ade88652c..9b9dcd7a4 100644 --- a/src/fundus/publishers/uk/the_sun.py +++ b/src/fundus/publishers/uk/the_sun.py @@ -4,12 +4,13 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -43,3 +44,11 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("article:tag")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + caption_selector=XPath("./ancestor::figure//figcaption/span[1]"), + ) diff --git a/src/fundus/publishers/uk/the_telegraph.py b/src/fundus/publishers/uk/the_telegraph.py index f45cbcca9..e8d537d25 100644 --- a/src/fundus/publishers/uk/the_telegraph.py +++ b/src/fundus/publishers/uk/the_telegraph.py @@ -2,13 +2,15 @@ from 
typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -49,6 +51,15 @@ def title(self) -> Optional[str]: def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("keywords")) + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + caption_selector=XPath("./ancestor::figure//figcaption/span[1]"), + relative_urls=True, + ) + class V1_1(V1): VALID_UNTIL = datetime.date.today() diff --git a/src/fundus/publishers/us/ap_news.py b/src/fundus/publishers/us/ap_news.py index 33730e369..d6c7f05f6 100644 --- a/src/fundus/publishers/us/ap_news.py +++ b/src/fundus/publishers/us/ap_news.py @@ -5,12 +5,13 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, normalize_whitespace, ) @@ -62,6 +63,8 @@ def topics(self) -> List[str]: if not re.search(self._topic_bloat_pattern, topic) ] + # unfortunately we would need to render the site first before parsing images for this version + class V1_1(V1): VALID_UNTIL = datetime.date.today() @@ -72,3 +75,18 @@ class V1_1(V1): "/p[not(preceding-sibling::*[1][self::h2 and text()='___'])]" # only p-elements not directly following h2 elements with text() = '___' ) + + @attribute + def images(self) -> List[Image]: + return 
image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//*[self::figure or @class='CarouselSlide']//img"), + caption_selector=XPath( + "./ancestor::figure//figcaption | " + "./ancestor::div[@class='CarouselSlide']//span[@class='CarouselSlide-infoDescription']" + ), + upper_boundary_selector=XPath("//div[@class='Page-content' or @class='Body']"), + lower_boundary_selector=CSSSelector("footer.Page-footer"), + author_selector=re.compile(r"\s*\((?P<credits>.*)\)$"), + ) diff --git a/src/fundus/publishers/us/business_insider.py b/src/fundus/publishers/us/business_insider.py index f36b1c0e4..4044d7803 100644 --- a/src/fundus/publishers/us/business_insider.py +++ b/src/fundus/publishers/us/business_insider.py @@ -4,12 +4,13 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -59,3 +60,14 @@ def topics(self) -> List[str]: or self.precomputed.ld.bf_search("keywords") or self.precomputed.meta.get("news_keywords") ) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//article"), + image_selector=XPath("//figure//img[not(@data-content-type)]"), + caption_selector=XPath("./ancestor::figure//figcaption/span[@class='image-caption-text']"), + author_selector=XPath("./ancestor::figure//figcaption/span[@class='image-source-text']"), + ) diff --git a/src/fundus/publishers/us/cnbc.py b/src/fundus/publishers/us/cnbc.py index 0418cb9fb..73f3d34b8 100644 --- a/src/fundus/publishers/us/cnbc.py +++ b/src/fundus/publishers/us/cnbc.py @@ -4,12 +4,13 @@ from lxml.cssselect 
import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -47,3 +48,18 @@ def topics(self) -> List[str]: @attribute(validate=False) def key_points(self) -> List[str]: return [key_point.text_content() for key_point in self._key_points_selector(self.precomputed.doc)] + + """ + CNBC uses unconventional image loading, which is not supported at the time + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//h1[@class='ArticleHeader-headline']"), + image_selector=XPath("//div[@class='InlineImage-wrapper']//img"), + caption_selector=XPath("./ancestor::div[@class='InlineImage-wrapper']//div[@class='InlineImage-imageEmbedCaption']"), + author_selector=XPath( + "./ancestor::div[@class='InlineImage-wrapper']//div[@class='InlineImage-imageEmbedCredit']") + ) + """ diff --git a/src/fundus/publishers/us/fox_news.py b/src/fundus/publishers/us/fox_news.py index f01a879bf..1108a2435 100644 --- a/src/fundus/publishers/us/fox_news.py +++ b/src/fundus/publishers/us/fox_news.py @@ -1,15 +1,17 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -43,3 +45,15 @@ def title(self) -> Optional[str]: @attribute def topics(self) -> List[str]: return 
generic_topic_parsing(self.precomputed.meta.get("classification-tags")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//article//picture//img[not(@*[starts-with(name(), 'data-v-')])]"), + caption_selector=XPath("(./ancestor::div[@class='image-ct inline']//div[@class='caption']/p/span)[1]"), + author_selector=XPath( + "(./ancestor::div[@class='image-ct inline']//div[@class='caption']/p/span)[last()]" + ), + ) diff --git a/src/fundus/publishers/us/free_beacon.py b/src/fundus/publishers/us/free_beacon.py index 3b140a7f2..61fbd364c 100644 --- a/src/fundus/publishers/us/free_beacon.py +++ b/src/fundus/publishers/us/free_beacon.py @@ -1,13 +1,16 @@ +import re from datetime import datetime from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -40,3 +43,11 @@ def title(self) -> Optional[str]: def topics(self) -> List[str]: topics: Optional[List[str]] = self.precomputed.ld.bf_search("keywords") return topics if topics else [] + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + author_selector=re.compile(r"\((?P<credits>.+)\)$"), + ) diff --git a/src/fundus/publishers/us/la_times.py b/src/fundus/publishers/us/la_times.py index abaed51ae..544387249 100644 --- a/src/fundus/publishers/us/la_times.py +++ b/src/fundus/publishers/us/la_times.py @@ -1,13 +1,16 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import 
ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -37,3 +40,13 @@ def authors(self) -> List[str]: @attribute def title(self) -> Optional[str]: return self.precomputed.meta.get("og:title") + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//div[@class='page-lead']|//h1[@class='headline']"), + caption_selector=XPath("./ancestor::figure//div[@class='figure-caption']"), + author_selector=XPath("./ancestor::figure//div[@class='figure-credit']"), + ) diff --git a/src/fundus/publishers/us/rolling_stone.py b/src/fundus/publishers/us/rolling_stone.py index c6b83b0b1..08ca98e02 100644 --- a/src/fundus/publishers/us/rolling_stone.py +++ b/src/fundus/publishers/us/rolling_stone.py @@ -2,13 +2,15 @@ from typing import List, Optional from lxml.cssselect import CSSSelector +from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, parse_title_from_root, ) @@ -46,6 +48,15 @@ def title(self) -> Optional[str]: def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.meta.get("swiftype:topics")) + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + caption_selector=XPath("./ancestor::figure//figcaption//span"), + author_selector=XPath("./ancestor::figure//figcaption//cite"), + ) + class V1_1(V1): VALID_UNTIL = datetime.date.today() 
diff --git a/src/fundus/publishers/us/techcrunch.py b/src/fundus/publishers/us/techcrunch.py index 9867c2c81..63ebdc371 100644 --- a/src/fundus/publishers/us/techcrunch.py +++ b/src/fundus/publishers/us/techcrunch.py @@ -1,15 +1,17 @@ import datetime +import re from typing import List, Optional from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -78,3 +80,17 @@ def authors(self) -> List[str]: @attribute def topics(self) -> List[str]: return generic_topic_parsing(self.precomputed.ld.bf_search("keywords")) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath( + "//div[@class='article-hero__first-section']|" + "//div[@class='is-floating wp-block-techcrunch-social-share']|" + "//h1[@class='wp-block-post-title']" + ), + caption_selector=XPath("./ancestor::figure//figcaption"), + author_selector=re.compile(r"(?i)image credits:(?P<credits>.*)"), + ) diff --git a/src/fundus/publishers/us/the_gateway_pundit.py b/src/fundus/publishers/us/the_gateway_pundit.py index 133769091..82a8c6068 100644 --- a/src/fundus/publishers/us/the_gateway_pundit.py +++ b/src/fundus/publishers/us/the_gateway_pundit.py @@ -1,13 +1,15 @@ +import re from datetime import datetime from typing import List, Optional from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -40,3 +42,12 
@@ def title(self) -> Optional[str]: if (title := self.precomputed.meta.get("og:title")) is not None: title = title.split("|")[0].strip() return title + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath("//div[@class='entry-content']//img"), + author_selector=XPath("./ancestor::figure//figcaption"), + ) diff --git a/src/fundus/publishers/us/the_intercept.py b/src/fundus/publishers/us/the_intercept.py index 4906e2050..6441ea6a5 100644 --- a/src/fundus/publishers/us/the_intercept.py +++ b/src/fundus/publishers/us/the_intercept.py @@ -4,11 +4,12 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, + image_extraction, ) @@ -63,3 +64,21 @@ class V1_1(V1): ) _paragraph_selector = CSSSelector("div.entry-content > div.entry-content__content > p, blockquote > p") _subheadline_selector = CSSSelector("div.entry-content > div.entry-content__content > h2") + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + image_selector=XPath( + "//img[(string-length(@alt) > 0 and not(contains(@class, 'attachment') or contains(@class, ':hidden'))) or @loading='eager']|//figure//img" + ), + caption_selector=XPath( + "(./parent::article//div[contains(@class, 'image__caption')]/span[not(@class)])[1]|" + "./ancestor::figure//figcaption/span[@class='photo__caption']" + ), + author_selector=XPath( + "(./parent::article//div[contains(@class, 'image__caption')]/span)[last()]|" + "./ancestor::figure//figcaption/span[@class='photo__credit']" + ), + ) diff --git 
a/src/fundus/publishers/us/the_nation.py b/src/fundus/publishers/us/the_nation.py index e069e85fc..09529553f 100644 --- a/src/fundus/publishers/us/the_nation.py +++ b/src/fundus/publishers/us/the_nation.py @@ -6,12 +6,20 @@ from lxml.cssselect import CSSSelector from lxml.etree import XPath -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute, function +from fundus.parser import ( + ArticleBody, + BaseParser, + Image, + ParserProxy, + attribute, + function, +) from fundus.parser.utility import ( extract_article_body_with_selector, generic_author_parsing, generic_date_parsing, generic_topic_parsing, + image_extraction, ) @@ -48,7 +56,7 @@ def _fix_malformed_html(self) -> None: # within .article-header-content. As a result,