
Commit 9a978e4

^q^
KurtBestor committed Jul 11, 2021
1 parent ed1b70d commit 9a978e4
Showing 7 changed files with 101 additions and 63 deletions.
7 changes: 4 additions & 3 deletions src/extractor/afreeca_downloader.py
@@ -28,6 +28,10 @@ class Downloader_afreeca(Downloader):
single = True
display_name = 'AfreecaTV'

@classmethod
def fix_url(cls, url):
return url.rstrip(' /')

def read(self):
session = Session()
video = get_video(self.url, session, self.cw)
@@ -52,9 +56,6 @@ def _get_stream(url_m3u8):
@try_n(8)
def get_video(url, session, cw):
print_ = get_print(cw)
while url.strip().endswith('/'):
url = url[:-1]

html = downloader.read_html(url, session=session)
if "document.location.href='https://login." in html:
raise errors.LoginRequired()
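
Note on the afreeca change: the trailing-slash cleanup that used to run inside get_video moves into the new fix_url classmethod. A minimal standalone sketch of the new behavior, with a made-up URL:

# Sketch of the new normalization: rstrip(' /') removes any trailing
# run of spaces and slashes in one call, replacing the old while loop
# that peeled off one character per iteration. (Sample URL is made up.)
def fix_url(url):
    return url.rstrip(' /')

print(fix_url('https://vod.afreecatv.com/PLAYER/STATION/12345// '))
# -> https://vod.afreecatv.com/PLAYER/STATION/12345
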
28 changes: 10 additions & 18 deletions src/extractor/navertoon_downloader.py
@@ -1,8 +1,3 @@
# uncompyle6 version 3.5.0
# Python bytecode 2.7 (62211)
# Decompiled from: Python 2.7.16 (v2.7.16:413a49145e, Mar 4 2019, 01:30:55) [MSC v.1500 32 bit (Intel)]
# Embedded file name: navertoon_downloader.pyo
# Compiled at: 2019-10-03 10:19:35
import downloader
from utils import Soup, urljoin, Downloader, LazyUrl, get_imgs_already, clean_title, get_ext, get_print
from constants import try_n
@@ -47,9 +42,14 @@ class Downloader_navertoon(Downloader):
display_name = 'Naver Webtoon'

def init(self):
self.url = get_main(self.url)
self.__info, _ = get_pages(self.url, self.cw)

@classmethod
def fix_url(cls, url):
url = re.sub(r'[?&]page=[0-9]+', '', re.sub(r'[?&]no=[0-9]+', '', url)).replace('m.comic.naver.', 'comic.naver.')
url = url.replace('detail.nhn', 'list.nhn').replace('/detail?', '/list?')
return url.rstrip('#')

@property
def name(self):
id = self.__info.id
@@ -70,14 +70,6 @@ def read(self):
self.title = self.name


def get_main(url):
url_main = re.sub('[?&]page=[0-9]+', '', re.sub('[?&]no=[0-9]+', '', url)).replace('detail.nhn', 'list.nhn').replace('m.comic.naver.', 'comic.naver.')
while url_main.endswith('#'):
url_main = url_main[:-1]

return url_main


def set_no(url, p):
if '&no=' not in url:
url = url + ('&no={}').format(p)
@@ -101,7 +93,7 @@ def set_page(url, p):
@try_n(4)
def get_pages(url, cw=None):
print_ = get_print(cw)
url = get_main(url).replace('comic.naver.', 'm.comic.naver.')
url = Downloader_navertoon.fix_url(url).replace('comic.naver.', 'm.comic.naver.')
id = get_id(url)
print('id:', id)
print(url)
@@ -119,7 +111,7 @@ def get_pages(url, cw=None):

raise Exception(title)

print('artist:', artist)
print_('artist: {}'.format(artist))
title = soup.find('meta', {'property': 'og:title'}).attrs['content']
pages = []
nos = set()
@@ -134,7 +126,7 @@
view = soup.findAll('ul', class_='section_episode_list')[(-1)]
for lst in view.findAll('li'):
url_page = urljoin(url, lst.find('a').attrs['href'])
if 'detail.nhn' not in url_page.lower():
if 'detail.nhn' not in url_page.lower() and 'detail?' not in url_page.lower(): #3540
continue
print_('url_page: {}'.format(url_page))
text = lst.find('strong', class_='title').find('span', class_='name').text.strip()
@@ -160,7 +152,7 @@
@page_selector.register('navertoon')
@try_n(4)
def f(url):
url = get_main(url)
url = Downloader_navertoon.fix_url(url)
info, pages = get_pages(url)
return pages

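
Note on the navertoon change: get_main is folded into the new fix_url classmethod, which strips the no=/page= query parameters, maps the mobile host to the desktop one, rewrites episode (detail) pages to the series list page (including the newer path-style '/detail?' URLs, #3540), and drops a trailing '#'. A standalone sketch with a made-up URL:

import re

# Sketch of the normalization fix_url performs (the sample URL below
# is hypothetical): drop no=/page= parameters, use the desktop host,
# point at the series list page, and trim any trailing '#'.
def fix_url(url):
    url = re.sub(r'[?&]page=[0-9]+', '', re.sub(r'[?&]no=[0-9]+', '', url)).replace('m.comic.naver.', 'comic.naver.')
    url = url.replace('detail.nhn', 'list.nhn').replace('/detail?', '/list?')
    return url.rstrip('#')

print(fix_url('https://m.comic.naver.com/webtoon/detail.nhn?titleId=123&no=4&page=2#'))
# -> https://comic.naver.com/webtoon/list.nhn?titleId=123
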
15 changes: 11 additions & 4 deletions src/extractor/pixiv_downloader.py
@@ -19,6 +19,7 @@
from locker import lock
import threading
from ratelimit import limits, sleep_and_retry
##import asyncio
FORCE_LOGIN = True
LIMIT = 48
for header in ['pixiv_illust', 'pixiv_bmk', 'pixiv_search', 'pixiv_following', 'pixiv_following_r18']:
@@ -60,10 +61,16 @@ def key_id(cls, url):
return url.replace('://www.', '://').replace('/en/', '/')

def read(self):
info = get_info(self.url, self.cw)
for img in info['imgs']:
self.urls.append(img.url)
self.title = clean_title(info['title'])
## loop = asyncio.new_event_loop()
## asyncio.set_event_loop(loop)
try:
info = get_info(self.url, self.cw)
for img in info['imgs']:
self.urls.append(img.url)
self.title = clean_title(info['title'])
finally:
## loop.close()
pass


class PixivAPIError(errors.LoginRequired): pass
73 changes: 48 additions & 25 deletions src/extractor/pornhub_downloader.py
@@ -14,6 +14,7 @@
import utils
from m3u8_tools import playlist2stream, M3u8_stream
import ytdl
import errors



@@ -53,6 +54,7 @@ class Video(object):
thumb = None

def __init__(self, url, cw, session):
url = Downloader_pornhub.fix_url(url)
self.url = LazyUrl(url, self.get, self)
self.cw = cw
self.session = session
@@ -68,11 +70,22 @@ def get(self, url):
return self._url

id_ = re.find(r'viewkey=(\w+)', url, re.IGNORECASE) or \
re.find(r'/embed/(\w+)', url, re.IGNORECASE)
print('id: {}'.format(id_))
re.find(r'/embed/(\w+)', url, re.IGNORECASE, err='no id')
print_('id: {}'.format(id_))
if 'viewkey=' not in url.lower() and '/gif/' not in url.lower():
url = urljoin(url, '/view_video.php?viewkey={}'.format(id_))
html = downloader.read_html(url, session=session)

url_test = url.replace('pornhubpremium.com', 'pornhub.com')
try:
html = downloader.read_html(url_test, session=session)
soup = Soup(html)
if soup.find('div', id='lockedPlayer'):
print_('Locked player')
raise Exception('Locked player')
url = url_test
except: #3511
url = url.replace('pornhub.com', 'pornhubpremium.com')
html = downloader.read_html(url, session=session)

soup = Soup(html)
soup = fix_soup(soup, url, session, cw)
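
Note on the block above: it is the new premium fallback (#3511). The free pornhub.com host is tried first, and only when that page is locked or unreadable does it fall back to pornhubpremium.com. Reduced to a sketch, where fetch_html and is_locked are hypothetical stand-ins for downloader.read_html and the lockedPlayer check:

# Sketch of the free-first, premium-fallback pattern (#3511).
def read_with_fallback(url, fetch_html, is_locked):
    url_test = url.replace('pornhubpremium.com', 'pornhub.com')
    try:
        html = fetch_html(url_test)
        if is_locked(html):
            raise Exception('Locked player')
        return url_test, html  # the free page worked; keep it
    except Exception:
        # locked or failed: retry on the premium host
        url = url.replace('pornhub.com', 'pornhubpremium.com')
        return url, fetch_html(url)
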
@@ -173,22 +186,30 @@ class Downloader_pornhub(Downloader):
type = 'pornhub'
single = True
strip_header = False
URLS = ['pornhub.com', 'pornhubpremium.com']
URLS = ['pornhub.com', 'pornhubpremium.com', 'pornhubthbh7ap3u.onion']

def init(self):
self.session = Session() # 1791
if 'pornhub_gif_' in self.url:
self.url = 'https://www.pornhub.com/gif/{}'.format(
self.url.replace('pornhub_gif_', ''))
elif 'pornhub_album_' in self.url:
self.url = 'https://www.pornhub.com/album/{}'.format(
self.url.replace('pornhub_album_', ''))
elif 'pornhub_' in self.url:
self.url = 'https://www.pornhub.com/view_video.php?viewkey={}'\
.format(self.url.replace('pornhub_', ''))
if 'pornhubpremium.com' in self.url.lower() and\
not is_login(self.session, self.cw):
return self.Invalid('[Pornhub] Login cookies required')
raise errors.LoginRequired()

@classmethod
def fix_url(cls, url):
if 'pornhub_gif_' in url:
url = 'https://www.pornhub.com/gif/{}'.format(
url.replace('pornhub_gif_', ''))
elif 'pornhub_album_' in url:
url = 'https://www.pornhub.com/album/{}'.format(
url.replace('pornhub_album_', ''))
elif 'pornhub_' in url:
url = 'https://www.pornhub.com/view_video.php?viewkey={}'\
.format(url.replace('pornhub_', ''))
if '/authenticate/goToLoggedIn' in url:
qs = utils.query_url(url)
url = urljoin(url, qs['url'][0])
url = url.replace('pornhubthbh7ap3u.onion', 'pornhub.com')
return url

@classmethod
def key_id(cls, url):
Expand Down Expand Up @@ -359,8 +380,10 @@ def get_videos(url, cw=None):

session = Session()

domain = utils.domain(url)

if mode in ['pornstar']:
url_main = 'https://www.pornhub.com/{}/{}'.format(mode, username)
url_main = 'https://{}/{}/{}'.format(domain, mode, username)
html = downloader.read_html(url_main, session=session)
soup = Soup(html)
soup = fix_soup(soup, url_main, session, cw)
@@ -414,41 +437,41 @@ def get_videos(url, cw=None):
try:
if mode in ['users', 'model']:
if mode == 'users':
url_api = 'https://www.pornhub.com/users/{}/videos/public/'\
'ajax?o=mr&page={}'.format(username, p)
url_api = 'https://{}/users/{}/videos/public/'\
'ajax?o=mr&page={}'.format(domain, username, p)
elif mode == 'model':
url_api = 'https://www.pornhub.com/model/{}/videos/upload/'\
'ajax?o=mr&page={}'.format(username, p)
url_api = 'https://{}/model/{}/videos/upload/'\
'ajax?o=mr&page={}'.format(domain, username, p)
r = session.post(url_api)
soup = Soup(r.text)
if soup.find('h1'):
print('break: h1')
break
elif mode in ['pornstar']:
if free:
url_api = 'https://www.pornhub.com/{}/{}/videos/upload'\
'?page={}'.format(mode, username, p)
url_api = 'https://{}/{}/{}/videos/upload'\
'?page={}'.format(domain, mode, username, p)
soup = downloader.read_soup(url_api, session=session)
soup = fix_soup(soup, url_api, session, cw)
soup = soup.find('div', class_='videoUList')
else:
url_api = 'https://www.pornhub.com/{}/{}?page={}'.format(mode, username, p)
url_api = 'https://{}/{}/{}?page={}'.format(domain, mode, username, p)
soup = downloader.read_soup(url_api, session=session)
soup = fix_soup(soup, url_api, session, cw)
soup = soup.find('ul', class_='pornstarsVideos')
elif mode in ['channels']:
url_api = 'https://www.pornhub.com/{}/{}/videos?page={}'.format(mode, username, p)
url_api = 'https://{}/{}/{}/videos?page={}'.format(domain, mode, username, p)
soup = downloader.read_soup(url_api, session=session)
soup = fix_soup(soup, url_api, session, cw)
try:
soup = soup.find('div', {'id': 'channelsBody'}).find('div', class_='rightSide')
except:
break
elif mode in ['playlist']:
#url_api = 'https://www.pornhub.com/playlist/viewChunked?id={}&offset={}&itemsPerPage=40'.format(username, len(hrefs))
#url_api = 'https://{}/playlist/viewChunked?id={}&offset={}&itemsPerPage=40'.format(domain, username, len(hrefs))
if token is None:
raise Exception('no token')
url_api = 'https://www.pornhub.com/playlist/viewChunked?id={}&token={}&page={}'.format(username, token, p)
url_api = 'https://{}/playlist/viewChunked?id={}&token={}&page={}'.format(domain, username, token, p)
soup = downloader.read_soup(url_api, session=session)
else:
raise NotImplementedError(mode)
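
Note on get_videos: the hard-coded www.pornhub.com endpoints are rebuilt from utils.domain(url), so listings keep working on the premium and onion hosts. A sketch of the idea, approximating utils.domain with urllib.parse (the mode, username, and page values are made up, and utils.domain's exact behavior is an assumption here):

from urllib.parse import urlparse

# Sketch of domain-preserving endpoint construction; urlparse(...).netloc
# stands in for utils.domain.
def channel_videos_url(url, mode, username, page):
    domain = urlparse(url).netloc
    return 'https://{}/{}/{}/videos?page={}'.format(domain, mode, username, page)

print(channel_videos_url('https://www.pornhubpremium.com/channels/example', 'channels', 'example', 1))
# -> https://www.pornhubpremium.com/channels/example/videos?page=1
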
20 changes: 12 additions & 8 deletions src/extractor/twitter_downloader.py
@@ -50,7 +50,7 @@ class Downloader_twitter(Downloader):
def init(self):
self.session = get_session()
#self.url = fix_url(self.url)
self.artist, self.username = get_artist_username(self.url, self.session)
self.artist, self.username = get_artist_username(self.url, self.session, self.cw)
if self.username == 'home':
raise Exception('No username: home')

@@ -97,8 +97,9 @@ def read(self):


@lock
def _guest_token(session, headers, cache=True):
def _guest_token(session, headers, cache=True, cw=None):
global CACHE_GUEST_TOKEN
print_ = get_print(cw)
token = None
if cache:
if CACHE_GUEST_TOKEN and time() - CACHE_GUEST_TOKEN[1] < TIMEOUT_GUEST_TOKEN:
@@ -111,6 +112,9 @@ def _guest_token(session, headers, cache=True):
r = session.post('https://api.twitter.com/1.1/guest/activate.json', headers=headers)
data = json.loads(r.text)
token = data['guest_token']
print_('token type: {}'.format(type(token)))
if isinstance(token, int): #3525
token = str(token)
CACHE_GUEST_TOKEN = token, time()
return token
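
Note on the isinstance check above (#3525): json.loads can decode guest_token as an int, but the value is later placed in the x-guest-token header and the gt cookie, both of which expect str values. In isolation:

import json

# Sketch of the #3525 fix; the token value below is made up.
data = json.loads('{"guest_token": 1413655555555555555}')
token = data['guest_token']
if isinstance(token, int):
    token = str(token)
assert token == '1413655555555555555'
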

@@ -133,7 +137,7 @@ def __init__(self, session, cw=None, cache_guest_token=True):
print('auth_token:', auth_token)
else:
# guest token
guest_token = _guest_token(session, session.headers, cache=cache_guest_token)
guest_token = _guest_token(session, session.headers, cache=cache_guest_token, cw=cw)
session.headers["x-guest-token"] = guest_token
session.cookies.set("gt", guest_token, domain=".twitter.com")
print('guest_token:', guest_token)
@@ -264,7 +268,7 @@ def _pagination(self, url_api, params=None, entry_tweet="tweet-", entry_cursor="
return
params["cursor"] = cursor
if params.get("cursor") is None: # nothing
print_('no cursor')
self.print_('no cursor')
break


@@ -374,7 +378,7 @@ def get_imgs_more(username, session, title, types, n=None, format='[%y-%m-%d] id
imgs = imgs or []
print_('imgs: {}, types: {}'.format(len(imgs), ', '.join(types)))

artist, username = get_artist_username(username, session)#
artist, username = get_artist_username(username, session, cw)#

# Range
n = max(n or 0, get_max_range(cw))
@@ -594,21 +598,21 @@ def get(self, _):


@try_n(4)
def get_artist_username(url, session):
def get_artist_username(url, session, cw=None):
if 'twitter.' not in url:
username = url.strip('@')
else:
id = re.find('/status/([0-9]+)', url)
if id:
tweet = TwitterAPI(session).tweet(id, url)
tweet = TwitterAPI(session, cw).tweet(id, url)
user_id = tweet['globalObjects']['tweets'][id]['user_id_str']
username = tweet['globalObjects']['users'][user_id]['screen_name']
print('username fixed:', username)
else:
username = re.find('twitter.[^/]+/([^/?]+)', url)
if not username:
raise Exception('no username')
data = TwitterAPI(session).user_by_screen_name(username)
data = TwitterAPI(session, cw).user_by_screen_name(username)
artist = data['legacy']['name']
username = data['legacy']['screen_name']
return artist, username
