Improve geo bypass mechanism
authorSergey M․ <dstftw@gmail.com>
Wed, 2 May 2018 00:18:01 +0000 (07:18 +0700)
committerSergey M․ <dstftw@gmail.com>
Wed, 2 May 2018 00:20:59 +0000 (07:20 +0700)
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

youtube_dl/YoutubeDL.py
youtube_dl/__init__.py
youtube_dl/extractor/anvato.py
youtube_dl/extractor/brightcove.py
youtube_dl/extractor/common.py
youtube_dl/extractor/dplay.py
youtube_dl/extractor/go.py
youtube_dl/extractor/limelight.py
youtube_dl/extractor/tvplay.py
youtube_dl/options.py
youtube_dl/utils.py

index ad359880526116d71d13fa2a736ab02ca79e19c0..f1a35901159f89ada4aaa8b3f23135b1e1e65f80 100755 (executable)
@@ -286,6 +286,9 @@ class YoutubeDL(object):
                        Two-letter ISO 3166-2 country code that will be used for
                        explicit geographic restriction bypassing via faking
                        X-Forwarded-For HTTP header (experimental)
+    geo_bypass_ip_block:
+                       IP range in CIDR notation that will be used similarly to
+                       geo_bypass_country (experimental)
 
     The following options determine which downloader is picked:
     external_downloader: Executable of the external downloader to call.
index 9bb952457e149f3687efd0ca925aed8d69996366..ba435ea428c57c933a5ea113dd58f38bbf7e2cd4 100644 (file)
@@ -430,6 +430,7 @@ def _real_main(argv=None):
         'config_location': opts.config_location,
         'geo_bypass': opts.geo_bypass,
         'geo_bypass_country': opts.geo_bypass_country,
+        'geo_bypass_ip_block': opts.geo_bypass_ip_block,
         # just for deprecation check
         'autonumber': opts.autonumber if opts.autonumber is True else None,
         'usetitle': opts.usetitle if opts.usetitle is True else None,
index 7a29cd2c6315fad6caa05ac6a499f07f80a41566..f6a78eb5d4cb1288d3bc2b864303343b24603422 100644 (file)
@@ -277,7 +277,9 @@ class AnvatoIE(InfoExtractor):
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
-        self._initialize_geo_bypass(smuggled_data.get('geo_countries'))
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+        })
 
         mobj = re.match(self._VALID_URL, url)
         access_key, video_id = mobj.group('access_key_or_mcp', 'id')
index 0e4eaef659105df0248d6ef5171f5f444660f1ee..ab62e54d63c2335d0002701c078105e00ccc6186 100644 (file)
@@ -669,7 +669,10 @@ class BrightcoveNewIE(AdobePassIE):
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
-        self._initialize_geo_bypass(smuggled_data.get('geo_countries'))
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+            'ip_blocks': smuggled_data.get('geo_ip_blocks'),
+        })
 
         account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups()
 
index a9939b0fdc817c42e4c09999560dfd8c76873a5c..3ef5af13c415205540fed3dd4a02bf6b6dfb1822 100644 (file)
@@ -346,6 +346,11 @@ class InfoExtractor(object):
     geo restriction bypass mechanism right away in order to bypass
     geo restriction, of course, if the mechanism is not disabled. (experimental)
 
+    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
+    IP blocks in CIDR notation for this extractor. One of these IP blocks
+    will be used by geo restriction bypass mechanism similarly
+    to _GEO_COUNTRIES. (experimental)
+
     NB: both these geo attributes are experimental and may change in future
     or be completely removed.
 
@@ -358,6 +363,7 @@ class InfoExtractor(object):
     _x_forwarded_for_ip = None
     _GEO_BYPASS = True
     _GEO_COUNTRIES = None
+    _GEO_IP_BLOCKS = None
     _WORKING = True
 
     def __init__(self, downloader=None):
@@ -392,12 +398,15 @@ class InfoExtractor(object):
 
     def initialize(self):
         """Initializes an instance (authentication, etc)."""
-        self._initialize_geo_bypass(self._GEO_COUNTRIES)
+        self._initialize_geo_bypass({
+            'countries': self._GEO_COUNTRIES,
+            'ip_blocks': self._GEO_IP_BLOCKS,
+        })
         if not self._ready:
             self._real_initialize()
             self._ready = True
 
-    def _initialize_geo_bypass(self, countries):
+    def _initialize_geo_bypass(self, geo_bypass_context):
         """
         Initialize geo restriction bypass mechanism.
 
@@ -408,28 +417,82 @@ class InfoExtractor(object):
         HTTP requests.
 
         This method will be used for initial geo bypass mechanism initialization
-        during the instance initialization with _GEO_COUNTRIES.
+        during the instance initialization with _GEO_COUNTRIES and
+        _GEO_IP_BLOCKS.
 
-        You may also manually call it from extractor's code if geo countries
+        You may also manually call it from extractor's code if geo bypass
         information is not available beforehand (e.g. obtained during
-        extraction) or due to some another reason.
+        extraction) or due to some other reason. In this case you should pass
+        this information in geo bypass context passed as first argument. It may
+        contain following fields:
+
+        countries:  List of geo unrestricted countries (similar
+                    to _GEO_COUNTRIES)
+        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
+                    (similar to _GEO_IP_BLOCKS)
+
         """
         if not self._x_forwarded_for_ip:
-            country_code = self._downloader.params.get('geo_bypass_country', None)
-            # If there is no explicit country for geo bypass specified and
-            # the extractor is known to be geo restricted let's fake IP
-            # as X-Forwarded-For right away.
-            if (not country_code and
-                    self._GEO_BYPASS and
-                    self._downloader.params.get('geo_bypass', True) and
-                    countries):
-                country_code = random.choice(countries)
-            if country_code:
-                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
+
+            # Geo bypass mechanism is explicitly disabled by user
+            if not self._downloader.params.get('geo_bypass', True):
+                return
+
+            if not geo_bypass_context:
+                geo_bypass_context = {}
+
+            # Backward compatibility: previously _initialize_geo_bypass
+            # expected a list of countries, some 3rd party code may still use
+            # it this way
+            if isinstance(geo_bypass_context, (list, tuple)):
+                geo_bypass_context = {
+                    'countries': geo_bypass_context,
+                }
+
+            # The whole point of geo bypass mechanism is to fake IP
+            # as X-Forwarded-For HTTP header based on some IP block or
+            # country code.
+
+            # Path 1: bypassing based on IP block in CIDR notation
+
+            # Explicit IP block specified by user, use it right away
+            # regardless of whether extractor is geo bypassable or not
+            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
+
+            # Otherwise use random IP block from geo bypass context but only
+            # if extractor is known as geo bypassable
+            if not ip_block:
+                ip_blocks = geo_bypass_context.get('ip_blocks')
+                if self._GEO_BYPASS and ip_blocks:
+                    ip_block = random.choice(ip_blocks)
+
+            if ip_block:
+                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
+                if self._downloader.params.get('verbose', False):
+                    self._downloader.to_screen(
+                        '[debug] Using fake IP %s as X-Forwarded-For.'
+                        % self._x_forwarded_for_ip)
+                return
+
+            # Path 2: bypassing based on country code
+
+            # Explicit country code specified by user, use it right away
+            # regardless of whether extractor is geo bypassable or not
+            country = self._downloader.params.get('geo_bypass_country', None)
+
+            # Otherwise use random country code from geo bypass context but
+            # only if extractor is known as geo bypassable
+            if not country:
+                countries = geo_bypass_context.get('countries')
+                if self._GEO_BYPASS and countries:
+                    country = random.choice(countries)
+
+            if country:
+                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                 if self._downloader.params.get('verbose', False):
                     self._downloader.to_screen(
                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
-                        % (self._x_forwarded_for_ip, country_code.upper()))
+                        % (self._x_forwarded_for_ip, country.upper()))
 
     def extract(self, url):
         """Extracts URL information and returns it in list of dicts."""
index b734467734c30a880badab5ff72cfac7543b3b56..8e0374320783195c810d1be8fff7d44b95b32414 100644 (file)
@@ -102,7 +102,9 @@ class DPlayIE(InfoExtractor):
         display_id = mobj.group('id')
         domain = mobj.group('domain')
 
-        self._initialize_geo_bypass([mobj.group('country').upper()])
+        self._initialize_geo_bypass({
+            'countries': [mobj.group('country').upper()],
+        })
 
         webpage = self._download_webpage(url, display_id)
 
index 9c7b1bd37d447c5a3cb09aa6b3418b63accffcad..e781405f2f7d55aa51f8cc50cc99df22ee5dff40 100644 (file)
@@ -123,7 +123,7 @@ class GoIE(AdobePassIE):
                         'adobe_requestor_id': requestor_id,
                     })
                 else:
-                    self._initialize_geo_bypass(['US'])
+                    self._initialize_geo_bypass({'countries': ['US']})
                 entitlement = self._download_json(
                     'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json',
                     video_id, data=urlencode_postdata(data))
index 2803d7e8df47c92003ccd11a23c951af1644a7a1..729d8de50fab70cd69bab41fae9db0cba4d7da9b 100644 (file)
@@ -282,7 +282,9 @@ class LimelightMediaIE(LimelightBaseIE):
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
         video_id = self._match_id(url)
-        self._initialize_geo_bypass(smuggled_data.get('geo_countries'))
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+        })
 
         pc, mobile, metadata = self._extract(
             video_id, 'getPlaylistByMediaId',
index 84597b55e0f6047a1dccd1905cb4771949b3cf00..e09b5f804d897954f4488344d27beaa8a7a2eea6 100644 (file)
@@ -227,14 +227,16 @@ class TVPlayIE(InfoExtractor):
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
-        self._initialize_geo_bypass(smuggled_data.get('geo_countries'))
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+        })
 
         video_id = self._match_id(url)
         geo_country = self._search_regex(
             r'https?://[^/]+\.([a-z]{2})', url,
             'geo country', default=None)
         if geo_country:
-            self._initialize_geo_bypass([geo_country.upper()])
+            self._initialize_geo_bypass({'countries': [geo_country.upper()]})
         video = self._download_json(
             'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON')
 
index 3e4ac03a240844ef3b69fd21dbedcc441a46e5c7..f3f8f23b6b8b623194fe4250a53bcf45c9d326c8 100644 (file)
@@ -249,6 +249,10 @@ def parseOpts(overrideArguments=None):
         '--geo-bypass-country', metavar='CODE',
         dest='geo_bypass_country', default=None,
         help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental)')
+    geo.add_option(
+        '--geo-bypass-ip-block', metavar='IP_BLOCK',
+        dest='geo_bypass_ip_block', default=None,
+        help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation (experimental)')
 
     selection = optparse.OptionGroup(parser, 'Video Selection')
     selection.add_option(
index b460393bf37106a47304dfa97dd7b1efa37117cd..f9ca63c589ffb56fb2c18f76ee40a99795265f0a 100644 (file)
@@ -3534,10 +3534,13 @@ class GeoUtils(object):
     }
 
     @classmethod
-    def random_ipv4(cls, code):
-        block = cls._country_ip_map.get(code.upper())
-        if not block:
-            return None
+    def random_ipv4(cls, code_or_block):
+        if len(code_or_block) == 2:
+            block = cls._country_ip_map.get(code_or_block.upper())
+            if not block:
+                return None
+        else:
+            block = code_or_block
         addr, preflen = block.split('/')
         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
         addr_max = addr_min | (0xffffffff >> int(preflen))