251 lines
8.8 KiB
Python
251 lines
8.8 KiB
Python
# -*- coding: utf-8 -*-
|
||
import re
|
||
import sys
|
||
from pyquery import PyQuery as pq
|
||
from base64 import b64decode, b64encode
|
||
from requests import Session
|
||
|
||
sys.path.append('..')
|
||
from base.spider import Spider
|
||
|
||
|
||
class Spider(Spider):
|
||
def init(self, extend=""):
|
||
self.headers['referer'] = f'{self.host}/'
|
||
self.session = Session()
|
||
self.session.headers.update(self.headers)
|
||
|
||
def getName(self):
|
||
return "JAV目录大全"
|
||
|
||
def isVideoFormat(self, url):
|
||
pass
|
||
|
||
def manualVideoCheck(self):
|
||
pass
|
||
|
||
def destroy(self):
|
||
pass
|
||
|
||
host = "https://javmenu.com"
|
||
|
||
headers = {
|
||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
|
||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
||
'sec-ch-ua': '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
|
||
'sec-ch-ua-mobile': '?0',
|
||
'sec-ch-ua-full-version': '"133.0.6943.98"',
|
||
'sec-ch-ua-arch': '"x86"',
|
||
'sec-ch-ua-platform': '"Windows"',
|
||
'sec-ch-ua-platform-version': '"19.0.0"',
|
||
'sec-ch-ua-model': '""',
|
||
'sec-ch-ua-full-version-list': '"Not(A:Brand";v="99.0.0.0", "Google Chrome";v="133.0.6943.98", "Chromium";v="133.0.6943.98"',
|
||
'dnt': '1',
|
||
'upgrade-insecure-requests': '1',
|
||
'sec-fetch-site': 'none',
|
||
'sec-fetch-mode': 'navigate',
|
||
'sec-fetch-user': '?1',
|
||
'sec-fetch-dest': 'document',
|
||
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
||
'priority': 'u=0, i'
|
||
}
|
||
|
||
# -------------------- 业务接口 --------------------
|
||
def homeContent(self, filter):
|
||
cateManual = {
|
||
"FC2在线": "/zh/fc2/online",
|
||
"成人动画": "/zh/hanime/online",
|
||
"国产在线": "/zh/chinese/online",
|
||
"有码在线": "/zh/censored/online",
|
||
"无码在线": "/zh/uncensored/online",
|
||
"欧美在线": "/zh/western/online"
|
||
}
|
||
classes = [{'type_name': k, 'type_id': v} for k, v in cateManual.items()]
|
||
return {'class': classes}
|
||
|
||
def homeVideoContent(self):
|
||
data = self.getpq("/zh")
|
||
return {'list': self.getlist(data(".video-list-item"))}
|
||
|
||
def categoryContent(self, tid, pg, filter, extend):
|
||
url = f"{self.host}{tid}" if pg == '1' else f"{self.host}{tid}?page={pg}"
|
||
data = self.getpq(url)
|
||
return {
|
||
'list': self.getlist(data(".video-list-item")),
|
||
'page': pg,
|
||
'pagecount': 9999,
|
||
'limit': 90,
|
||
'total': 999999
|
||
}
|
||
|
||
def detailContent(self, ids):
|
||
vod_id = ids[0]
|
||
if not vod_id.startswith('http'):
|
||
url = f"{self.host}{vod_id}"
|
||
else:
|
||
url = vod_id
|
||
vod_id = vod_id.replace(self.host, '')
|
||
data = self.getpq(url)
|
||
vod = {
|
||
'vod_id': vod_id,
|
||
'vod_name': data('h1').text() or data('title').text().split(' - ')[0],
|
||
'vod_pic': self.getCover(data),
|
||
'vod_content': data('.card-text').text() or '',
|
||
'vod_director': '',
|
||
'vod_actor': self.getActors(data),
|
||
'vod_area': '日本',
|
||
'vod_year': self.getYear(data('.text-muted').text()),
|
||
'vod_remarks': self.getRemarks(data),
|
||
'vod_play_from': 'JAV在线',
|
||
'vod_play_url': self.getPlaylist(data, url)
|
||
}
|
||
return {'list': [vod]}
|
||
|
||
def searchContent(self, key, quick, pg="1"):
|
||
url = f"{self.host}/zh/search?wd={key}&page={pg}"
|
||
data = self.getpq(url)
|
||
return {'list': self.getlist(data(".video-list-item"))}
|
||
|
||
def playerContent(self, flag, id, vipFlags):
|
||
return {'parse': 0, 'url': self.d64(id), 'header': self.headers}
|
||
|
||
# -------------------- 私有工具 --------------------
|
||
def getlist(self, data):
|
||
vlist = []
|
||
for item in data.items():
|
||
link = item('a').attr('href')
|
||
if not link or '/zh/' not in link:
|
||
continue
|
||
link = link.replace(self.host, '') if link.startswith(self.host) else link
|
||
name = item('.card-title').text() or item('img').attr('alt') or ''
|
||
if not name:
|
||
continue
|
||
vlist.append({
|
||
'vod_id': link,
|
||
'vod_name': name.split(' - ')[0].strip(),
|
||
'vod_pic': self.getListPicture(item),
|
||
'vod_remarks': (item('.text-muted').text() or '').strip(),
|
||
'style': {'ratio': 1.5, 'type': 'rect'}
|
||
})
|
||
return vlist
|
||
|
||
# ******** 修复版本:支持LazyLoad和正确过滤 ********
|
||
def getListPicture(self, item):
|
||
"""
|
||
获取列表中的图片
|
||
支持LazyLoad延迟加载机制
|
||
过滤水印、占位符和无预览图
|
||
"""
|
||
# 获取所有img标签
|
||
imgs = item('img')
|
||
|
||
for img in imgs.items():
|
||
# 优先级:先从data-src获取(LazyLoad属性),再从src获取
|
||
pic = img.attr('data-src') or img.attr('src')
|
||
|
||
# 过滤条件:排除水印、占位符、加载图片
|
||
if pic and not any(keyword in pic for keyword in ['button_logo', 'no_preview', 'loading.gif', 'loading.png']):
|
||
return pic
|
||
|
||
return ''
|
||
|
||
def getCover(self, data):
|
||
"""
|
||
获取详情页的图片
|
||
支持LazyLoad延迟加载机制
|
||
过滤水印、占位符和无预览图
|
||
"""
|
||
# 获取所有img标签
|
||
imgs = data('img')
|
||
|
||
for img in imgs.items():
|
||
# 优先级:先从data-src获取(LazyLoad属性),再从src获取
|
||
pic = img.attr('data-src') or img.attr('src')
|
||
|
||
# 过滤条件:排除水印、占位符、加载图片
|
||
if pic and not any(keyword in pic for keyword in ['button_logo', 'no_preview', 'loading.gif', 'loading.png', 'website_building']):
|
||
return pic
|
||
|
||
return ''
|
||
|
||
# **********************************
|
||
def getActors(self, data):
|
||
"""获取演员信息"""
|
||
actors = []
|
||
h1_text = data('h1').text()
|
||
if h1_text:
|
||
actors.extend(h1_text.strip().split()[1:])
|
||
actor_links = data('a[href*="/actor/"]')
|
||
for actor_link in actor_links.items():
|
||
actor_text = actor_link.text()
|
||
if actor_text and actor_text not in actors:
|
||
actors.append(actor_text)
|
||
return ','.join(actors) if actors else '未知'
|
||
|
||
def getYear(self, date_str):
|
||
"""从日期字符串中提取年份"""
|
||
m = re.search(r'(\d{4})-\d{2}-\d{2}', date_str or '')
|
||
return m.group(1) if m else ''
|
||
|
||
def getRemarks(self, data):
|
||
"""获取备注信息(标签)"""
|
||
tags = [tag.text() for tag in data('.badge').items() if tag.text()]
|
||
return ' '.join(set(tags)) if tags else ''
|
||
|
||
def getPlaylist(self, data, url):
|
||
"""
|
||
获取播放列表
|
||
从source、video标签和脚本中提取m3u8链接
|
||
"""
|
||
play_urls, seen = [], set()
|
||
|
||
# 从source标签获取
|
||
for src in data('source').items():
|
||
u = src.attr('src')
|
||
if u and u not in seen:
|
||
play_urls.append(f"源{len(play_urls)+1}${self.e64(u)}")
|
||
seen.add(u)
|
||
|
||
# 从video标签获取
|
||
for u in data('video').items():
|
||
u = u.attr('src')
|
||
if u and u not in seen:
|
||
play_urls.append(f"线路{len(play_urls)+1}${self.e64(u)}")
|
||
seen.add(u)
|
||
|
||
# 从脚本中提取m3u8链接
|
||
for m in re.findall(r'https?://[^\s"\'<>]+\.m3u8[^\s"\'<>]*', data('script').text()):
|
||
if m not in seen:
|
||
play_urls.append(f"线路{len(play_urls)+1}${self.e64(m)}")
|
||
seen.add(m)
|
||
|
||
# 如果没有找到播放链接,使用页面URL
|
||
if not play_urls:
|
||
play_urls.append(f"在线播放${self.e64(url)}")
|
||
|
||
return '#'.join(play_urls)
|
||
|
||
def getpq(self, path=''):
|
||
"""获取页面内容并返回PyQuery对象"""
|
||
url = path if path.startswith('http') else f'{self.host}{path}'
|
||
try:
|
||
rsp = self.session.get(url, timeout=20)
|
||
rsp.encoding = 'utf-8'
|
||
return pq(rsp.text)
|
||
except Exception as e:
|
||
print(f"getpq error: {e}")
|
||
return pq('')
|
||
|
||
def e64(self, text):
|
||
"""Base64编码"""
|
||
try:
|
||
return b64encode(text.encode('utf-8')).decode('utf-8')
|
||
except Exception:
|
||
return ''
|
||
|
||
def d64(self, encoded_text):
|
||
"""Base64解码"""
|
||
try:
|
||
return b64decode(encoded_text.encode('utf-8')).decode('utf-8')
|
||
except Exception:
|
||
return '' |