2018-05-10 19:13:36 +02:00
|
|
|
from typing import Dict
|
2016-10-27 12:06:44 +02:00
|
|
|
from .base import BaseParser
|
|
|
|
|
|
|
|
|
|
|
|
class OpenGraphParser(BaseParser):
    """Parser that collects a whitelisted subset of Open Graph meta tags."""

    # Only <meta> tags whose ``property`` attribute is one of these values
    # are picked up by ``extract_data``.
    allowed_og_properties = {
        'og:title',
        'og:description',
        'og:image',
    }

    def extract_data(self) -> Dict[str, str]:
        """Return the whitelisted Open Graph values found in the document.

        Every tag returned by ``self._soup.findAll('meta')`` is inspected
        (``self._soup`` is presumably a BeautifulSoup document set up by the
        base class -- confirm against ``BaseParser``). A tag contributes to
        the result only when it has a ``property`` attribute listed in
        ``allowed_og_properties`` and also has a ``content`` attribute.

        Returns:
            A dict mapping the property name without its ``og:`` prefix
            (e.g. ``'title'``) to the tag's ``content`` value. If the same
            property occurs on several tags, the last one wins.
        """
        prefix_length = len('og:')
        extracted = {}

        for meta_tag in self._soup.findAll('meta'):
            is_allowed_property = (
                meta_tag.has_attr('property')
                and meta_tag['property'] in self.allowed_og_properties
            )
            if is_allowed_property and meta_tag.has_attr('content'):
                # Store under the bare name: 'og:title' -> 'title'.
                short_name = meta_tag['property'][prefix_length:]
                extracted[short_name] = meta_tag['content']

        return extracted
|