Files
a/PY1/badnews (1).py
2026-03-24 18:40:17 +08:00

179 lines
5.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import re
from html import unescape
from urllib.parse import quote, urljoin

from base.spider import Spider
class Spider(Spider):
    """Spider for bad.news: categories, listings, detail pages, playback, search.

    Pages are downloaded with ``self.fetch`` (not defined in this class, so
    presumably provided by the imported base ``Spider``) and scraped with
    regular expressions.
    """

    # Listing titles containing any of these words are treated as ads
    # (hot-topic / job-posting cards) and dropped.
    _BLACKLIST = ('热点', '招聘', '20k', '工作制', '双休', '远程', '月薪')

    # Waterfall card: a single tag carrying href, title and a cover attribute.
    _CARD_RE = re.compile(
        r'href="([^"]+)"[^>]*title="([^"]+)"[^>]*(?:data-echo-background|poster)="([^"]+)"',
        re.S,
    )
    # Feed entry: everything between <table ...> and </table>.
    _TABLE_RE = re.compile(r'<table.*?>(.*?)</table>', re.S)
    _H3_RE = re.compile(r'<h3.*?>(.*?)</h3>', re.S)
    _TAG_RE = re.compile(r'<[^>]+>')
    _HREF_RE = re.compile(r'href="([^"]+)"')
    _POSTER_RE = re.compile(r'poster="([^"]+)"')
    # re.S so a <title> spanning several lines is still matched.
    _TITLE_RE = re.compile(r'<title>(.*?)</title>', re.S)
    _IFRAME_RE = re.compile(r'<iframe[^>]+src="([^"]+)"')
    _VIDEO_SRC_RE = re.compile(r'<video[^>]+data-source="([^"]+)"')

    def __init__(self):
        # NOTE(review): the base-class __init__ is not invoked here; confirm
        # the base Spider needs no initialisation of its own.
        self.name = 'Bad.news'
        self.host = 'https://bad.news'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36',
            'Referer': self.host + '/',
            'Origin': self.host,
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }

    def getName(self):
        """Return the display name of this spider."""
        return self.name

    def init(self, extend=""):
        """Initialisation hook; this spider needs no setup and ignores ``extend``."""
        pass

    # =========================
    # Home page categories
    # =========================
    def homeContent(self, filter):
        """Return the fixed category list; ``filter`` is not used.

        Each ``type_id`` is the URL path prefix consumed by
        ``categoryContent``; the empty id selects the site's front page.
        """
        return {
            'class': [
                {'type_id': '', 'type_name': '新出品'},
                {'type_id': '/dm', 'type_name': 'H动漫'},
                {'type_id': '/av/release', 'type_name': '日本AV'},
                {'type_id': '/tag/long-porn', 'type_name': '长视频'}
            ]
        }

    def homeVideoContent(self):
        """Return the first page of the front-page listing."""
        return self.categoryContent('', '1', False, {})

    # =========================
    # Listing parsing
    # =========================
    def _is_ad(self, title):
        """Return True when ``title`` contains any blacklisted keyword."""
        return any(word in title for word in self._BLACKLIST)

    @staticmethod
    def _make_vod(path, name, pic):
        """Build one listing entry; the cover URL is cut at its query string."""
        return {
            'vod_id': path,
            'vod_name': name,
            'vod_pic': unescape(pic).split('?')[0],
            'vod_remarks': ''
        }

    def parse_list(self, html):
        """Extract video entries from the HTML of a listing page.

        Two layouts are scanned: waterfall cards (one tag with href, title
        and cover) and ``<table>`` feed blocks (title in ``<h3>``, first
        ``href``, optional ``poster``). Entries are skipped when the title is
        blacklisted, the link is not site-relative (does not start with
        ``/``), or the same link was already emitted by either pass.

        Args:
            html: Raw HTML text of the listing page.

        Returns:
            A list of dicts with keys ``vod_id`` (site-relative path),
            ``vod_name``, ``vod_pic`` and ``vod_remarks``.
        """
        videos = []
        seen = set()  # paths already emitted, for O(1) de-duplication

        # 1. Waterfall cards.
        for href, title, pic in self._CARD_RE.findall(html):
            # Attribute values are entity-escaped in the page source.
            path = unescape(href)
            name = unescape(title).strip()
            if self._is_ad(name) or not path.startswith('/') or path in seen:
                continue
            seen.add(path)
            videos.append(self._make_vod(path, name, pic))

        # 2. <table> feed blocks.
        for block in self._TABLE_RE.findall(html):
            # Validate the title first so ad blocks are rejected early.
            heading = self._H3_RE.search(block)
            if not heading:
                continue
            # Strip nested tags before unescaping so that escaped angle
            # brackets in the text are not mistaken for markup.
            name = unescape(self._TAG_RE.sub('', heading.group(1))).strip()
            if not name or self._is_ad(name):
                continue
            link = self._HREF_RE.search(block)
            if not link:
                continue
            path = unescape(link.group(1))
            if not path.startswith('/') or path in seen:
                continue
            poster = self._POSTER_RE.search(block)
            seen.add(path)
            videos.append(
                self._make_vod(path, name, poster.group(1) if poster else '')
            )
        return videos

    # =========================
    # Category listing
    # =========================
    def categoryContent(self, tid, pg, filter, extend):
        """Fetch and parse one page of a category.

        Args:
            tid: Category path prefix from ``homeContent`` ('' = front page).
            pg: Page number, as an int or numeric string.
            filter: Unused.
            extend: Unused.

        Returns:
            Dict with ``list`` (parsed entries), ``page`` (int) and
            ``pagecount`` (fixed at 999; the real page count is not parsed).

        Raises:
            ValueError: If ``pg`` cannot be converted to an int.
        """
        page = int(pg)
        if tid:
            url = f'{self.host}{tid}/page-{page}'
        elif page == 1:
            # The first front page lives at the bare host URL.
            url = self.host
        else:
            url = f'{self.host}/page-{page}'
        res = self.fetch(url, headers=self.headers)
        return {'list': self.parse_list(res.text), 'page': page, 'pagecount': 999}

    # =========================
    # Detail page (HTML video / DM iframe split)
    # =========================
    def detailContent(self, ids):
        """Fetch a detail page and build its play entry.

        Paths under ``/dm`` are played through the page's ``<iframe>`` (the
        page URL itself when no iframe is found) with the ``DM-Web`` source;
        other pages use the ``data-source`` attribute of their ``<video>``
        tag with the ``HTML`` source.

        Args:
            ids: Sequence whose first item is the site-relative detail path.

        Returns:
            ``{'list': [entry]}`` with one entry, or ``{'list': []}`` when
            ``ids`` is empty or no playable source is found.
        """
        if not ids:
            return {'list': []}
        path = ids[0]
        url = urljoin(self.host + '/', path)
        page = self.fetch(url, headers=self.headers).text

        title_m = self._TITLE_RE.search(page)
        if title_m:
            # Keep only the part of the page title before the first '-'.
            title = unescape(title_m.group(1)).split('-')[0].strip()
        else:
            title = 'Bad.news'

        # ===== DM (H-anime) pages =====
        if path.startswith('/dm'):
            iframe = self._IFRAME_RE.search(page)
            # urljoin resolves absolute, root-relative and protocol-relative
            # ("//host/...") iframe sources alike.
            play_url = urljoin(url, unescape(iframe.group(1))) if iframe else url
            return {'list': [{
                # Report the detail path (as the HTML branch does) so the id
                # can be passed back into detailContent unchanged.
                'vod_id': path,
                'vod_name': title,
                'vod_play_from': 'DM-Web',
                'vod_play_url': f'播放${play_url}'
            }]}

        # ===== Regular HTML video pages =====
        source = self._VIDEO_SRC_RE.search(page)
        if source:
            video_url = urljoin(url, unescape(source.group(1)))
            return {'list': [{
                'vod_id': path,
                'vod_name': title,
                'vod_play_from': 'HTML',
                'vod_play_url': f'播放${video_url}'
            }]}
        return {'list': []}

    # =========================
    # Player
    # =========================
    def playerContent(self, flag, id, vipFlags):
        """Return playback instructions for a play URL.

        Args:
            flag: Play source name; ``'DM-Web'`` selects WebView sniffing.
            id: The play URL taken from ``vod_play_url``.
            vipFlags: Unused.

        Returns:
            For ``DM-Web``: a dict requesting parsing/sniffing of ``id`` with
            request headers and include/exclude patterns. Otherwise a dict
            asking for direct playback of ``id``.
        """
        # Direct HTML playback.
        if flag != 'DM-Web':
            return {'parse': 0, 'url': id}

        # DM pages are played by sniffing media requests in a WebView.
        return {
            'parse': 1,
            'sniff': 1,
            'url': id,
            'header': {
                'User-Agent': self.headers['User-Agent'],
                'Referer': self.host + '/',
                'Origin': self.host,
                'Range': 'bytes=0-'
            },
            'sniff_include': ['.mp4', '.m3u8'],
            # NOTE(review): if the host app matches these as plain
            # substrings, 'ads' also matches e.g. 'uploads' - confirm.
            'sniff_exclude': [
                '.html', '.js', '.css',
                '.jpg', '.png', '.gif',
                'google', 'facebook',
                'doubleclick', 'analytics',
                'ads', 'tracker'
            ]
        }

    # =========================
    # Search
    # =========================
    def searchContent(self, key, quick, pg="1"):
        """Search the site for ``key`` and return the parsed result list.

        Args:
            key: Search keyword; percent-encoded before being placed in the
                URL path so characters such as '/', '?', '#' or spaces
                cannot alter the request.
            quick: Unused.
            pg: Unused; only the first result page is fetched.

        Returns:
            ``{'list': [...]}`` with entries as produced by ``parse_list``.
        """
        keyword = quote(key, safe='')
        url = f'{self.host}/search/q-{keyword}'
        res = self.fetch(url, headers=self.headers)
        return {'list': self.parse_list(res.text)}