2018-05-10 19:13:36 +02:00
|
|
|
from typing import Dict
|
2022-02-18 22:48:53 +01:00
|
|
|
from urllib.parse import urlparse
|
2020-06-11 00:54:34 +02:00
|
|
|
|
2016-10-27 12:06:44 +02:00
|
|
|
from .base import BaseParser
|
|
|
|
|
|
|
|
|
|
|
|
class OpenGraphParser(BaseParser):
    """Parser that collects a fixed set of Open Graph ``<meta>`` properties."""

    # The only ``property`` values this parser reads; every other meta tag
    # is ignored.
    allowed_og_properties = {
        "og:title",
        "og:description",
        "og:image",
    }

    def extract_data(self) -> Dict[str, str]:
        """Return the allowed Open Graph values found in ``self._soup``.

        Every ``meta`` element is visited in the order returned by the
        lookup. An element contributes only when its ``property`` attribute
        is listed in ``allowed_og_properties`` and it also has a ``content``
        attribute. The resulting keys are the property names with the
        leading ``og:`` removed. When the same property occurs several
        times, the value of the last accepted element is kept.

        An ``image`` value is dropped when ``urlparse`` raises
        ``ValueError`` for it.
        """
        prefix_length = len("og:")
        extracted = {}

        # NOTE(review): ``findAll`` looks like the legacy Beautiful Soup
        # spelling of ``find_all`` -- confirm the type of ``_soup`` before
        # renaming the call.
        for element in self._soup.findAll("meta"):
            is_allowed = (
                element.has_attr("property")
                and element["property"] in self.allowed_og_properties
            )
            if not is_allowed:
                continue
            if not element.has_attr("content"):
                continue

            key = element["property"][prefix_length:]
            value = element["content"]

            if key == "image":
                # urlparse is used rather than URLValidator so that
                # relative URLs are still accepted.
                try:
                    urlparse(value)
                except ValueError:
                    continue

            extracted[key] = value

        return extracted
|