yt_dlp/extractor/rudovideo.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    determine_ext,
    js_to_json,
    traverse_obj,
    update_url_query,
    url_or_none,
)


class RudoVideoIE(InfoExtractor):
    """Extractor for rudo.video VOD, podcast and live player pages."""

    _VALID_URL = r'https?://rudo\.video/(?P<type>vod|podcast|live)/(?P<id>[^/?&#]+)'
    # The scheme is optional so that protocol-relative iframe sources
    # (src="//rudo.video/...") are matched as well as absolute ones
    _EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//rudo\.video/(?:vod|podcast|live)/[^\'"]+)']
    _TESTS = [{
        'url': 'https://rudo.video/podcast/cz2wrUy8l0o',
        'md5': '28ed82b477708dc5e12e072da2449221',
        'info_dict': {
            'id': 'cz2wrUy8l0o',
            'title': 'Diego Cabot',
            'ext': 'mp4',
            'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
        },
    }, {
        'url': 'https://rudo.video/podcast/bQkt07',
        'md5': '36b22a9863de0f47f00fc7532a32a898',
        'info_dict': {
            'id': 'bQkt07',
            'title': 'Tubular Bells',
            'ext': 'mp4',
            'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
        },
    }, {
        'url': 'https://rudo.video/podcast/b42ZUznHX0',
        'md5': 'b91c70d832938871367f8ad10c895821',
        'info_dict': {
            'id': 'b42ZUznHX0',
            'title': 'Columna Ruperto Concha',
            'ext': 'mp3',
            'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
        },
    }, {
        'url': 'https://rudo.video/vod/bN5AaJ',
        'md5': '01324a329227e2591530ecb4f555c881',
        'info_dict': {
            'id': 'bN5AaJ',
            'title': 'Ucrania 19.03',
            'creator': 'La Tercera',
            'ext': 'mp4',
            'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
        },
    }, {
        'url': 'https://rudo.video/live/bbtv',
        'info_dict': {
            'id': 'bbtv',
            'ext': 'mp4',
            'creator': 'BioBioTV',
            'live_status': 'is_live',
            'title': r're:^LIVE BBTV\s\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}$',
            'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
        },
    }, {
        'url': 'https://rudo.video/live/c13',
        'info_dict': {
            'id': 'c13',
            'title': 'CANAL13',
            'ext': 'mp4',
        },
        'skip': 'Geo-restricted to Chile',
    }, {
        'url': 'https://rudo.video/live/t13-13cl',
        'info_dict': {
            'id': 't13-13cl',
            'title': 'T13',
            'ext': 'mp4',
        },
        'skip': 'Geo-restricted to Chile',
    }]

    def _real_extract(self, url):
        """Build the info dict for a rudo.video page.

        The media URL is read from the page's ``streamURL`` JS variable, or
        from a ``<source>`` tag when that variable is absent. If neither is
        present but the player config points at YouTube, the result is
        delegated to the Youtube extractor. When the page embeds an access
        token array, an auth token is downloaded and appended to the media
        URL as the ``auth-token`` query parameter.

        Raises ExtractorError when no stream URL can be found, when the
        token array holds no URL, or when the token response carries no
        auth token.
        """
        video_id, type_ = self._match_valid_url(url).group('id', 'type')
        is_live = type_ == 'live'

        webpage = self._download_webpage(url, video_id)
        if 'Streaming is not available in your area' in webpage:
            self.raise_geo_restricted()

        media_url = (
            # The capture stops at "?", so any query string on streamURL is dropped
            self._search_regex(
                r'var\s+streamURL\s*=\s*[\'"]([^?\'"]+)', webpage, 'stream url', default=None)
            # Source URL must be used only if streamURL is unavailable
            or self._search_regex(
                r'<source[^>]+src=[\'"]([^\'"]+)', webpage, 'source url', default=None))
        if not media_url:
            youtube_url = self._search_regex(r'file:\s*[\'"]((?:https?:)//(?:www\.)?youtube\.com[^\'"]+)',
                                             webpage, 'youtube url', default=None)
            if youtube_url:
                return self.url_result(youtube_url, 'Youtube')
            raise ExtractorError('Unable to extract stream url')

        # Some pages carry a JS array literal (var _$_xxxx = [...]); the first
        # element of it that is a valid URL is used as the token endpoint
        token_array = self._search_json(
            r'<script>var\s+_\$_[a-zA-Z0-9]+\s*=', webpage, 'access token array', video_id,
            contains_pattern=r'\[(?s:.+)\]', default=None, transform_source=js_to_json)
        if token_array:
            token_url = traverse_obj(token_array, (..., {url_or_none}), get_all=False)
            if not token_url:
                raise ExtractorError('Invalid access token array')
            token_data = self._download_json(
                token_url, video_id, note='Downloading access token')
            # Traverse safely: an unexpected response shape should surface as an
            # ExtractorError instead of a bare KeyError/TypeError
            access_token = traverse_obj(token_data, ('data', 'authToken'))
            if not access_token:
                raise ExtractorError('Unable to extract access token')
            media_url = update_url_query(media_url, {'auth-token': access_token})

        ext = determine_ext(media_url)
        if ext == 'm3u8':
            formats = self._extract_m3u8_formats(media_url, video_id, live=is_live)
        elif ext == 'mp3':
            # Audio-only file: flag that there is no video stream
            formats = [{
                'url': media_url,
                'vcodec': 'none',
            }]
        else:
            formats = [{'url': media_url}]

        return {
            'id': video_id,
            # Page JS variables are preferred; OpenGraph tags are the fallback
            'title': (self._search_regex(r'var\s+titleVideo\s*=\s*[\'"]([^\'"]+)',
                                         webpage, 'title', default=None)
                      or self._og_search_title(webpage)),
            'creator': self._search_regex(r'var\s+videoAuthor\s*=\s*[\'"]([^?\'"]+)',
                                          webpage, 'videoAuthor', default=None),
            'thumbnail': (self._search_regex(r'var\s+posterIMG\s*=\s*[\'"]([^?\'"]+)',
                                             webpage, 'thumbnail', default=None)
                          or self._og_search_thumbnail(webpage)),
            'formats': formats,
            'is_live': is_live,
        }