diff --git a/src/extractor/afreeca_downloader.py b/src/extractor/afreeca_downloader.py index 91bc6d3..46cbdee 100644 --- a/src/extractor/afreeca_downloader.py +++ b/src/extractor/afreeca_downloader.py @@ -28,6 +28,10 @@ class Downloader_afreeca(Downloader): single = True display_name = 'AfreecaTV' + @classmethod + def fix_url(cls, url): + return url.rstrip(' /') + def read(self): session = Session() video = get_video(self.url, session, self.cw) @@ -52,9 +56,6 @@ def _get_stream(url_m3u8): @try_n(8) def get_video(url, session, cw): print_ = get_print(cw) - while url.strip().endswith('/'): - url = url[:-1] - html = downloader.read_html(url, session=session) if "document.location.href='https://login." in html: raise errors.LoginRequired() diff --git a/src/extractor/navertoon_downloader.py b/src/extractor/navertoon_downloader.py index d5a5c9f..de7cec7 100644 --- a/src/extractor/navertoon_downloader.py +++ b/src/extractor/navertoon_downloader.py @@ -1,8 +1,3 @@ -# uncompyle6 version 3.5.0 -# Python bytecode 2.7 (62211) -# Decompiled from: Python 2.7.16 (v2.7.16:413a49145e, Mar 4 2019, 01:30:55) [MSC v.1500 32 bit (Intel)] -# Embedded file name: navertoon_downloader.pyo -# Compiled at: 2019-10-03 10:19:35 import downloader from utils import Soup, urljoin, Downloader, LazyUrl, get_imgs_already, clean_title, get_ext, get_print from constants import try_n @@ -47,9 +42,14 @@ class Downloader_navertoon(Downloader): display_name = 'Naver Webtoon' def init(self): - self.url = get_main(self.url) self.__info, _ = get_pages(self.url, self.cw) + @classmethod + def fix_url(cls, url): + url = re.sub(r'[?&]page=[0-9]+', '', re.sub(r'[?&]no=[0-9]+', '', url)).replace('m.comic.naver.', 'comic.naver.') + url = url.replace('detail.nhn', 'list.nhn').replace('/detail?', '/list?') + return url.rstrip('#') + @property def name(self): id = self.__info.id @@ -70,14 +70,6 @@ def read(self): self.title = self.name -def get_main(url): - url_main = re.sub('[?&]page=[0-9]+', '', re.sub('[?&]no=[0-9]+', '', url)).replace('detail.nhn', 'list.nhn').replace('m.comic.naver.', 'comic.naver.') - while url_main.endswith('#'): - url_main = url_main[:-1] - - return url_main - - def set_no(url, p): if '&no=' not in url: url = url + ('&no={}').format(p) @@ -101,7 +93,7 @@ def set_page(url, p): @try_n(4) def get_pages(url, cw=None): print_ = get_print(cw) - url = get_main(url).replace('comic.naver.', 'm.comic.naver.') + url = Downloader_navertoon.fix_url(url).replace('comic.naver.', 'm.comic.naver.') id = get_id(url) print('id:', id) print(url) @@ -119,7 +111,7 @@ def get_pages(url, cw=None): raise Exception(title) - print('artist:', artist) + print_('artist: {}'.format(artist)) title = soup.find('meta', {'property': 'og:title'}).attrs['content'] pages = [] nos = set() @@ -134,7 +126,7 @@ def get_pages(url, cw=None): view = soup.findAll('ul', class_='section_episode_list')[(-1)] for lst in view.findAll('li'): url_page = urljoin(url, lst.find('a').attrs['href']) - if 'detail.nhn' not in url_page.lower(): + if 'detail.nhn' not in url_page.lower() and 'detail?' not in url_page.lower(): #3540 continue print_('url_page: {}'.format(url_page)) text = lst.find('strong', class_='title').find('span', class_='name').text.strip() @@ -160,7 +152,7 @@ def get_pages(url, cw=None): @page_selector.register('navertoon') @try_n(4) def f(url): - url = get_main(url) + url = Downloader_navertoon.fix_url(url) info, pages = get_pages(url) return pages diff --git a/src/extractor/pixiv_downloader.py b/src/extractor/pixiv_downloader.py index 3ebddf9..688ab52 100644 --- a/src/extractor/pixiv_downloader.py +++ b/src/extractor/pixiv_downloader.py @@ -19,6 +19,7 @@ from locker import lock import threading from ratelimit import limits, sleep_and_retry +##import asyncio FORCE_LOGIN = True LIMIT = 48 for header in ['pixiv_illust', 'pixiv_bmk', 'pixiv_search', 'pixiv_following', 'pixiv_following_r18']: @@ -60,10 +61,16 @@ def key_id(cls, url): return url.replace('://www.', '://').replace('/en/', '/') def read(self): - info = get_info(self.url, self.cw) - for img in info['imgs']: - self.urls.append(img.url) - self.title = clean_title(info['title']) +## loop = asyncio.new_event_loop() +## asyncio.set_event_loop(loop) + try: + info = get_info(self.url, self.cw) + for img in info['imgs']: + self.urls.append(img.url) + self.title = clean_title(info['title']) + finally: +## loop.close() + pass class PixivAPIError(errors.LoginRequired): pass diff --git a/src/extractor/pornhub_downloader.py b/src/extractor/pornhub_downloader.py index 75df354..4486922 100644 --- a/src/extractor/pornhub_downloader.py +++ b/src/extractor/pornhub_downloader.py @@ -14,6 +14,7 @@ import utils from m3u8_tools import playlist2stream, M3u8_stream import ytdl +import errors @@ -53,6 +54,7 @@ class Video(object): thumb = None def __init__(self, url, cw, session): + url = Downloader_pornhub.fix_url(url) self.url = LazyUrl(url, self.get, self) self.cw = cw self.session = session @@ -68,11 +70,22 @@ def get(self, url): return self._url id_ = re.find(r'viewkey=(\w+)', url, re.IGNORECASE) or \ - re.find(r'/embed/(\w+)', url, re.IGNORECASE) - print('id: {}'.format(id_)) + re.find(r'/embed/(\w+)', url, re.IGNORECASE, err='no id') + print_('id: {}'.format(id_)) if 'viewkey=' not in url.lower() and '/gif/' not in url.lower(): url = urljoin(url, '/view_video.php?viewkey={}'.format(id_)) - html = downloader.read_html(url, session=session) + + url_test = url.replace('pornhubpremium.com', 'pornhub.com') + try: + html = downloader.read_html(url_test, session=session) + soup = Soup(html) + if soup.find('div', id='lockedPlayer'): + print_('Locked player') + raise Exception('Locked player') + url = url_test + except: #3511 + url = url.replace('pornhub.com', 'pornhubpremium.com') + html = downloader.read_html(url, session=session) soup = Soup(html) soup = fix_soup(soup, url, session, cw) @@ -173,22 +186,30 @@ class Downloader_pornhub(Downloader): type = 'pornhub' single = True strip_header = False - URLS = ['pornhub.com', 'pornhubpremium.com'] + URLS = ['pornhub.com', 'pornhubpremium.com', 'pornhubthbh7ap3u.onion'] def init(self): self.session = Session() # 1791 - if 'pornhub_gif_' in self.url: - self.url = 'https://www.pornhub.com/gif/{}'.format( - self.url.replace('pornhub_gif_', '')) - elif 'pornhub_album_' in self.url: - self.url = 'https://www.pornhub.com/album/{}'.format( - self.url.replace('pornhub_album_', '')) - elif 'pornhub_' in self.url: - self.url = 'https://www.pornhub.com/view_video.php?viewkey={}'\ - .format(self.url.replace('pornhub_', '')) if 'pornhubpremium.com' in self.url.lower() and\ not is_login(self.session, self.cw): - return self.Invalid('[Pornhub] Login cookies required') + raise errors.LoginRequired() + + @classmethod + def fix_url(cls, url): + if 'pornhub_gif_' in url: + url = 'https://www.pornhub.com/gif/{}'.format( + url.replace('pornhub_gif_', '')) + elif 'pornhub_album_' in url: + url = 'https://www.pornhub.com/album/{}'.format( + url.replace('pornhub_album_', '')) + elif 'pornhub_' in url: + url = 'https://www.pornhub.com/view_video.php?viewkey={}'\ + .format(url.replace('pornhub_', '')) + if '/authenticate/goToLoggedIn' in url: + qs = utils.query_url(url) + url = urljoin(url, qs['url'][0]) + url = url.replace('pornhubthbh7ap3u.onion', 'pornhub.com') + return url @classmethod def key_id(cls, url): @@ -359,8 +380,10 @@ def get_videos(url, cw=None): session = Session() + domain = utils.domain(url) + if mode in ['pornstar']: - url_main = 'https://www.pornhub.com/{}/{}'.format(mode, username) + url_main = 'https://{}/{}/{}'.format(domain, mode, username) html = downloader.read_html(url_main, session=session) soup = Soup(html) soup = fix_soup(soup, url_main, session, cw) @@ -414,11 +437,11 @@ def get_videos(url, cw=None): try: if mode in ['users', 'model']: if mode == 'users': - url_api = 'https://www.pornhub.com/users/{}/videos/public/'\ - 'ajax?o=mr&page={}'.format(username, p) + url_api = 'https://{}/users/{}/videos/public/'\ + 'ajax?o=mr&page={}'.format(domain, username, p) elif mode == 'model': - url_api = 'https://www.pornhub.com/model/{}/videos/upload/'\ - 'ajax?o=mr&page={}'.format(username, p) + url_api = 'https://{}/model/{}/videos/upload/'\ + 'ajax?o=mr&page={}'.format(domain, username, p) r = session.post(url_api) soup = Soup(r.text) if soup.find('h1'): @@ -426,18 +449,18 @@ def get_videos(url, cw=None): break elif mode in ['pornstar']: if free: - url_api = 'https://www.pornhub.com/{}/{}/videos/upload'\ - '?page={}'.format(mode, username, p) + url_api = 'https://{}/{}/{}/videos/upload'\ + '?page={}'.format(domain, mode, username, p) soup = downloader.read_soup(url_api, session=session) soup = fix_soup(soup, url_api, session, cw) soup = soup.find('div', class_='videoUList') else: - url_api = 'https://www.pornhub.com/{}/{}?page={}'.format(mode, username, p) + url_api = 'https://{}/{}/{}?page={}'.format(domain, mode, username, p) soup = downloader.read_soup(url_api, session=session) soup = fix_soup(soup, url_api, session, cw) soup = soup.find('ul', class_='pornstarsVideos') elif mode in ['channels']: - url_api = 'https://www.pornhub.com/{}/{}/videos?page={}'.format(mode, username, p) + url_api = 'https://{}/{}/{}/videos?page={}'.format(domain, mode, username, p) soup = downloader.read_soup(url_api, session=session) soup = fix_soup(soup, url_api, session, cw) try: @@ -445,10 +468,10 @@ def get_videos(url, cw=None): except: break elif mode in ['playlist']: - #url_api = 'https://www.pornhub.com/playlist/viewChunked?id={}&offset={}&itemsPerPage=40'.format(username, len(hrefs)) + #url_api = 'https://{}/playlist/viewChunked?id={}&offset={}&itemsPerPage=40'.format(domain, username, len(hrefs)) if token is None: raise Exception('no token') - url_api = 'https://www.pornhub.com/playlist/viewChunked?id={}&token={}&page={}'.format(username, token, p) + url_api = 'https://{}/playlist/viewChunked?id={}&token={}&page={}'.format(domain, username, token, p) soup = downloader.read_soup(url_api, session=session) else: raise NotImplementedError(mode) diff --git a/src/extractor/twitter_downloader.py b/src/extractor/twitter_downloader.py index 677d654..7d68f17 100644 --- a/src/extractor/twitter_downloader.py +++ b/src/extractor/twitter_downloader.py @@ -50,7 +50,7 @@ class Downloader_twitter(Downloader): def init(self): self.session = get_session() #self.url = fix_url(self.url) - self.artist, self.username = get_artist_username(self.url, self.session) + self.artist, self.username = get_artist_username(self.url, self.session, self.cw) if self.username == 'home': raise Exception('No username: home') @@ -97,8 +97,9 @@ def read(self): @lock -def _guest_token(session, headers, cache=True): +def _guest_token(session, headers, cache=True, cw=None): global CACHE_GUEST_TOKEN + print_ = get_print(cw) token = None if cache: if CACHE_GUEST_TOKEN and time() - CACHE_GUEST_TOKEN[1] < TIMEOUT_GUEST_TOKEN: @@ -111,6 +112,9 @@ def _guest_token(session, headers, cache=True): r = session.post('https://api.twitter.com/1.1/guest/activate.json', headers=headers) data = json.loads(r.text) token = data['guest_token'] + print_('token type: {}'.format(type(token))) + if isinstance(token ,int): #3525 + token = str(token) CACHE_GUEST_TOKEN = token, time() return token @@ -133,7 +137,7 @@ def __init__(self, session, cw=None, cache_guest_token=True): print('auth_token:', auth_token) else: # guest token - guest_token = _guest_token(session, session.headers, cache=cache_guest_token) + guest_token = _guest_token(session, session.headers, cache=cache_guest_token, cw=cw) session.headers["x-guest-token"] = guest_token session.cookies.set("gt", guest_token, domain=".twitter.com") print('guest_token:', guest_token) @@ -264,7 +268,7 @@ def _pagination(self, url_api, params=None, entry_tweet="tweet-", entry_cursor=" return params["cursor"] = cursor if params.get("cursor") is None: # nothing - print_('no cursor') + self.print_('no cursor') break @@ -374,7 +378,7 @@ def get_imgs_more(username, session, title, types, n=None, format='[%y-%m-%d] id imgs = imgs or [] print_('imgs: {}, types: {}'.format(len(imgs), ', '.join(types))) - artist, username = get_artist_username(username, session)# + artist, username = get_artist_username(username, session, cw)# # Range n = max(n or 0, get_max_range(cw)) @@ -594,13 +598,13 @@ def get(self, _): @try_n(4) -def get_artist_username(url, session): +def get_artist_username(url, session, cw=None): if 'twitter.' not in url: username = url.strip('@') else: id = re.find('/status/([0-9]+)', url) if id: - tweet = TwitterAPI(session).tweet(id, url) + tweet = TwitterAPI(session, cw).tweet(id, url) user_id = tweet['globalObjects']['tweets'][id]['user_id_str'] username = tweet['globalObjects']['users'][user_id]['screen_name'] print('username fixed:', username) @@ -608,7 +612,7 @@ def get_artist_username(url, session): username = re.find('twitter.[^/]+/([^/?]+)', url) if not username: raise Exception('no username') - data = TwitterAPI(session).user_by_screen_name(username) + data = TwitterAPI(session, cw).user_by_screen_name(username) artist = data['legacy']['name'] username = data['legacy']['screen_name'] return artist, username diff --git a/src/extractor/weibo_downloader.py b/src/extractor/weibo_downloader.py index 4f4e826..f6e63dd 100644 --- a/src/extractor/weibo_downloader.py +++ b/src/extractor/weibo_downloader.py @@ -3,7 +3,7 @@ import ree as re from timee import sleep, clock, time from constants import clean_url -from utils import Downloader, urljoin, try_n, Session, get_print, clean_title, Soup, fix_protocol, domain +from utils import Downloader, urljoin, try_n, Session, get_print, clean_title, Soup, fix_protocol, domain, get_max_range import os from translator import tr_ import json @@ -118,6 +118,8 @@ def get_id(url, cw=None): def get_imgs(uid, oid, title, session, cw=None, d=None, parent=None): print_ = get_print(cw) print_('uid: {}, oid:{}'.format(uid, oid)) + + max_pid = get_max_range(cw) @try_n(4) def get_album_imgs(album, page): @@ -168,21 +170,23 @@ def get_albums(page): imgs = [] for album in albums: print('Album:', album.id, album.type) + imgs_album = [] for p in range(1, 101): imgs_new = get_album_imgs(album, p) - imgs += imgs_new + imgs_album += imgs_new s = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)) if cw: - if not cw.alive: - return [] cw.setTitle(s) else: print(s) + if len(imgs_album) >= max_pid: + break if not imgs_new: break sleep(1) + imgs += imgs_album imgs = sorted(imgs, key=lambda img: img.timestamp, reverse=True) - return imgs + return imgs[:max_pid] diff --git a/src/extractor/youtube_downloader.py b/src/extractor/youtube_downloader.py index 37ad68b..8cfbbf7 100644 --- a/src/extractor/youtube_downloader.py +++ b/src/extractor/youtube_downloader.py @@ -20,6 +20,7 @@ from PyQt import QtCore, QtGui from translator import tr_ from m3u8_tools import dash2stream +from datetime import datetime def print_streams(streams, cw): @@ -74,6 +75,11 @@ def get(self, url, force=False): streams = yt.streams.all() print_streams(streams, cw) + + #3528 + time = datetime.strptime(yt.info['upload_date'], '%Y%m%d') + self.utime = (time-datetime(1970,1,1)).total_seconds() + print_('utime: {}'.format(self.utime)) if type == 'video': streams[:] = [stream for stream in streams if stream.video_codec is not None] @@ -328,6 +334,7 @@ class Downloader_youtube(Downloader): URLS = ['youtube.co', 'youtu.be'] lock = True display_name = 'YouTube' + keep_date = True #3528 def init(self): ui_setting = self.ui_setting