Files
a/PY1/JAV目录.py
2026-03-24 18:40:17 +08:00

251 lines
8.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import re
import sys
from pyquery import PyQuery as pq
from base64 import b64decode, b64encode
from requests import Session
sys.path.append('..')
from base.spider import Spider
class Spider(Spider):
def init(self, extend=""):
self.headers['referer'] = f'{self.host}/'
self.session = Session()
self.session.headers.update(self.headers)
def getName(self):
return "JAV目录大全"
def isVideoFormat(self, url):
pass
def manualVideoCheck(self):
pass
def destroy(self):
pass
host = "https://javmenu.com"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'sec-ch-ua': '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-full-version': '"133.0.6943.98"',
'sec-ch-ua-arch': '"x86"',
'sec-ch-ua-platform': '"Windows"',
'sec-ch-ua-platform-version': '"19.0.0"',
'sec-ch-ua-model': '""',
'sec-ch-ua-full-version-list': '"Not(A:Brand";v="99.0.0.0", "Google Chrome";v="133.0.6943.98", "Chromium";v="133.0.6943.98"',
'dnt': '1',
'upgrade-insecure-requests': '1',
'sec-fetch-site': 'none',
'sec-fetch-mode': 'navigate',
'sec-fetch-user': '?1',
'sec-fetch-dest': 'document',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
'priority': 'u=0, i'
}
# -------------------- 业务接口 --------------------
def homeContent(self, filter):
cateManual = {
"FC2在线": "/zh/fc2/online",
"成人动画": "/zh/hanime/online",
"国产在线": "/zh/chinese/online",
"有码在线": "/zh/censored/online",
"无码在线": "/zh/uncensored/online",
"欧美在线": "/zh/western/online"
}
classes = [{'type_name': k, 'type_id': v} for k, v in cateManual.items()]
return {'class': classes}
def homeVideoContent(self):
data = self.getpq("/zh")
return {'list': self.getlist(data(".video-list-item"))}
def categoryContent(self, tid, pg, filter, extend):
url = f"{self.host}{tid}" if pg == '1' else f"{self.host}{tid}?page={pg}"
data = self.getpq(url)
return {
'list': self.getlist(data(".video-list-item")),
'page': pg,
'pagecount': 9999,
'limit': 90,
'total': 999999
}
def detailContent(self, ids):
vod_id = ids[0]
if not vod_id.startswith('http'):
url = f"{self.host}{vod_id}"
else:
url = vod_id
vod_id = vod_id.replace(self.host, '')
data = self.getpq(url)
vod = {
'vod_id': vod_id,
'vod_name': data('h1').text() or data('title').text().split(' - ')[0],
'vod_pic': self.getCover(data),
'vod_content': data('.card-text').text() or '',
'vod_director': '',
'vod_actor': self.getActors(data),
'vod_area': '日本',
'vod_year': self.getYear(data('.text-muted').text()),
'vod_remarks': self.getRemarks(data),
'vod_play_from': 'JAV在线',
'vod_play_url': self.getPlaylist(data, url)
}
return {'list': [vod]}
def searchContent(self, key, quick, pg="1"):
url = f"{self.host}/zh/search?wd={key}&page={pg}"
data = self.getpq(url)
return {'list': self.getlist(data(".video-list-item"))}
def playerContent(self, flag, id, vipFlags):
return {'parse': 0, 'url': self.d64(id), 'header': self.headers}
# -------------------- 私有工具 --------------------
def getlist(self, data):
vlist = []
for item in data.items():
link = item('a').attr('href')
if not link or '/zh/' not in link:
continue
link = link.replace(self.host, '') if link.startswith(self.host) else link
name = item('.card-title').text() or item('img').attr('alt') or ''
if not name:
continue
vlist.append({
'vod_id': link,
'vod_name': name.split(' - ')[0].strip(),
'vod_pic': self.getListPicture(item),
'vod_remarks': (item('.text-muted').text() or '').strip(),
'style': {'ratio': 1.5, 'type': 'rect'}
})
return vlist
# ******** 修复版本支持LazyLoad和正确过滤 ********
def getListPicture(self, item):
"""
获取列表中的图片
支持LazyLoad延迟加载机制
过滤水印、占位符和无预览图
"""
# 获取所有img标签
imgs = item('img')
for img in imgs.items():
# 优先级先从data-src获取LazyLoad属性再从src获取
pic = img.attr('data-src') or img.attr('src')
# 过滤条件:排除水印、占位符、加载图片
if pic and not any(keyword in pic for keyword in ['button_logo', 'no_preview', 'loading.gif', 'loading.png']):
return pic
return ''
def getCover(self, data):
"""
获取详情页的图片
支持LazyLoad延迟加载机制
过滤水印、占位符和无预览图
"""
# 获取所有img标签
imgs = data('img')
for img in imgs.items():
# 优先级先从data-src获取LazyLoad属性再从src获取
pic = img.attr('data-src') or img.attr('src')
# 过滤条件:排除水印、占位符、加载图片
if pic and not any(keyword in pic for keyword in ['button_logo', 'no_preview', 'loading.gif', 'loading.png', 'website_building']):
return pic
return ''
# **********************************
def getActors(self, data):
"""获取演员信息"""
actors = []
h1_text = data('h1').text()
if h1_text:
actors.extend(h1_text.strip().split()[1:])
actor_links = data('a[href*="/actor/"]')
for actor_link in actor_links.items():
actor_text = actor_link.text()
if actor_text and actor_text not in actors:
actors.append(actor_text)
return ','.join(actors) if actors else '未知'
def getYear(self, date_str):
"""从日期字符串中提取年份"""
m = re.search(r'(\d{4})-\d{2}-\d{2}', date_str or '')
return m.group(1) if m else ''
def getRemarks(self, data):
"""获取备注信息(标签)"""
tags = [tag.text() for tag in data('.badge').items() if tag.text()]
return ' '.join(set(tags)) if tags else ''
def getPlaylist(self, data, url):
"""
获取播放列表
从source、video标签和脚本中提取m3u8链接
"""
play_urls, seen = [], set()
# 从source标签获取
for src in data('source').items():
u = src.attr('src')
if u and u not in seen:
play_urls.append(f"{len(play_urls)+1}${self.e64(u)}")
seen.add(u)
# 从video标签获取
for u in data('video').items():
u = u.attr('src')
if u and u not in seen:
play_urls.append(f"线路{len(play_urls)+1}${self.e64(u)}")
seen.add(u)
# 从脚本中提取m3u8链接
for m in re.findall(r'https?://[^\s"\'<>]+\.m3u8[^\s"\'<>]*', data('script').text()):
if m not in seen:
play_urls.append(f"线路{len(play_urls)+1}${self.e64(m)}")
seen.add(m)
# 如果没有找到播放链接使用页面URL
if not play_urls:
play_urls.append(f"在线播放${self.e64(url)}")
return '#'.join(play_urls)
def getpq(self, path=''):
"""获取页面内容并返回PyQuery对象"""
url = path if path.startswith('http') else f'{self.host}{path}'
try:
rsp = self.session.get(url, timeout=20)
rsp.encoding = 'utf-8'
return pq(rsp.text)
except Exception as e:
print(f"getpq error: {e}")
return pq('')
def e64(self, text):
"""Base64编码"""
try:
return b64encode(text.encode('utf-8')).decode('utf-8')
except Exception:
return ''
def d64(self, encoded_text):
"""Base64解码"""
try:
return b64decode(encoded_text.encode('utf-8')).decode('utf-8')
except Exception:
return ''