diff --git a/zefie_wtvp_minisrv/client_emu.js b/zefie_wtvp_minisrv/client_emu.js index 3c81b401..f05008f5 100644 --- a/zefie_wtvp_minisrv/client_emu.js +++ b/zefie_wtvp_minisrv/client_emu.js @@ -1,10 +1,15 @@ +const path = require('path'); +const classPath = path.resolve(__dirname + path.sep + "includes" + path.sep + "classes" + path.sep) + path.sep; +require(classPath + "Prototypes.js"); +const WTVSec = require(classPath + "WTVSec.js"); +const WTVShared = require(classPath + "/WTVShared.js")['WTVShared']; +const LZPF = require(classPath + "/LZPF.js"); + const net = require('net'); +const crypto = require('crypto'); const CryptoJS = require('crypto-js'); -const WTVSec = require('./includes/classes/WTVSec.js'); -const e = require('express'); -const WTVShared = require('./includes/classes/WTVShared.js')['WTVShared']; -const LZPF = require('./includes/classes/LZPF.js'); const zlib = require('zlib'); +const AdmZip = require('adm-zip'); /** * WebTV Client Simulator @@ -13,13 +18,22 @@ const zlib = require('zlib'); * using the WTVP protocol with proper authentication and service discovery. */ class WebTVClientSimulator { - constructor(host, port, ssid, url, outputFile = null, maxRedirects = 10, useEncryption = false, request_type_download = false, debug = false, tricks = false) { + constructor(host, port, ssid, url, outputFile = null, maxRedirects = 10, useEncryption = false, request_type_download = false, debug = false, tricks = false, followImages = false, followAll = false, maxDepth = 5, maxRetries = 5) { this.host = host; this.port = port; this.ssid = ssid; this.url = url; this.request_type_download = request_type_download; this.outputFile = outputFile; + this.followImages = followImages; + this.followAll = followAll; + this.maxDepth = maxDepth; + this.maxRetries = maxRetries; + this.currentDepth = 0; + this.downloadedUrls = new Set(); // Track what we've already downloaded + this.pendingDownloads = []; // Queue of {url, referrer} objects to download + this.allContent = new Map(); // Store all downloaded content + this.downloadChecksums = new Map(); // Store expected MD5 checksums for validation this.maxRedirects = maxRedirects; this.useEncryption = useEncryption; this.encryptionEnabled = false; @@ -76,6 +90,14 @@ class WebTVClientSimulator { const lzpf = new LZPF(); const decompressed = lzpf.expand(body); this.debugLog(`LZPF decompression: ${body.length} bytes -> ${decompressed.length} bytes`); + + // Validate decompressed size matches content-length header + const expectedSize = headers['content-length'] ? parseInt(headers['content-length']) : 0; + if (expectedSize > 0 && decompressed.length !== expectedSize) { + console.warn(`LZPF decompression size mismatch: expected ${expectedSize} bytes, got ${decompressed.length} bytes`); + this.debugLog(`LZPF size validation failed - this may indicate incomplete or corrupted data`); + } + return decompressed; } @@ -109,7 +131,7 @@ class WebTVClientSimulator { */ async start() { try { - await this.makeRequest('wtv-1800', '/preregister'); + await this.makeRequestWithRetry('wtv-1800', '/preregister'); } catch (error) { console.error('Failed to start simulation:', error); } @@ -383,12 +405,21 @@ class WebTVClientSimulator { buildEncryptedRequest(serviceName, path, data = null) { const method = data ? 'POST' : 'GET'; let request = `${method} ${serviceName}:${path}\r\n`; - + + // Add Referer header if we have a previous URL + if (this.previousUrl) { + request += `Referer: ${this.previousUrl}\r\n`; + } // For encrypted requests, only include the minimal necessary headers // The SECURE ON already sent the auth and session info - - if (this.request_type_download) request += 'wtv-request-type: download\r\n'; - + + request += `wtv-request-type: ${(this.request_type_download) ? 'download' : 'primary'}\r\n`; + request += `wtv-show-time: 0\r\n`; + request += `wtv-system-cpuspeed: 166187148\r\n`; + request += `wtv-system-sysconfig: 4163328\r\n`; + request += `wtv-disk-size: 8006\r\n`; + request += `wtv-viewer: zefie-minisrv-client_emu\r\n`; // Note: no space after colon + // Add content if POST if (data) { const content = typeof data === 'string' ? data : JSON.stringify(data); @@ -524,6 +555,11 @@ class WebTVClientSimulator { this.hasSeenEncryptedResponse = true; } + // Store content if in follow-all mode + if (this.followAll && body.length > 0) { + this.storeContent(this.previousUrl || this.url, { headers, body, status: statusLine }); + } + // Don't close the current connection - keep it for reuse // The socket will be managed by the socket pool @@ -611,6 +647,11 @@ class WebTVClientSimulator { this.hasSeenEncryptedResponse = true; } + // Store content if in follow-all mode + if (this.followAll && bodyBuf.length > 0) { + this.storeContent(currentUrl || this.previousUrl || this.url, { headers, body: bodyBuf, status: statusLine }); + } + // Check if server wants to close connection if (socket && socketKey) { if (headers['connection'] && headers['connection'].toLowerCase() === 'close') { @@ -821,7 +862,7 @@ class WebTVClientSimulator { const serviceName = match[1]; const path = '/' + (match[2] || ''); this.debugLog(`Parsed service: ${serviceName}, path: ${path}`); - return await this.makeRequest(serviceName, path, (post) ? '1' : null, null); + return await this.makeRequestWithRetry(serviceName, path, (post) ? '1' : null, null); } else { throw new Error(`Invalid wtv-visit URL: ${visitUrl}`); } @@ -844,20 +885,35 @@ class WebTVClientSimulator { this.debugLog(`Parsed target service: ${serviceName}, path: ${path}`); try { - const result = await this.makeRequest(serviceName, path, null, false); + const result = await this.makeRequestWithRetry(serviceName, path, null, false); // Handle the response if (result.body) { this.debugLog('\n*** Target URL Response Body ***'); if (this.outputFile) { - await this.saveToFile(result.body); + if (this.followAll) { + // Store the main content first + this.storeContent(this.url, result); + + // Process all pending downloads + await this.processAllDownloads(); + + // Create comprehensive archive + await this.createComprehensiveArchive(); + } else if (this.followImages) { + await this.saveToFile(result.body, result.headers); + } else { + await this.saveToFile(result.body, result.headers); + } console.log(`Content saved to: ${this.outputFile}`); } else { // Detect text content for CLI output const contentType = result.headers['content-type'] || ''; - if (/^text\//.test(contentType) || /json|xml|javascript/.test(contentType)) { + if (/^text\//.test(contentType) || /json|xml|javascript|download-list/.test(contentType)) { console.log(result.body.toString('utf8')); + } else if (result.body.length === 0) { + console.log(''); } else { console.log(''); } @@ -934,18 +990,905 @@ class WebTVClientSimulator { } /** - * Save content to file + * Save content to file - with optional HTML image following */ - async saveToFile(content) { + async saveToFile(content, headers = {}) { const fs = require('fs').promises; + try { - await fs.writeFile(this.outputFile, Buffer.isBuffer(content) ? content : Buffer.from(content, 'utf8')); + // Check content type for --follow processing + const contentType = headers['content-type'] || ''; + const normalizedContentType = contentType.split(';')[0].trim().toLowerCase(); + + const isHtml = /text\/html/i.test(contentType) || + (typeof content === 'string' && / setTimeout(resolve, 100)); + } catch (error) { + console.warn(`Failed to download referenced file ${fileUrl}: ${error.message}`); + } + } + + // Write the zip file + zip.writeZip(archivePath); + + console.log(`Download-list archive created: ${archivePath}`); + console.log(`Archive contains: ${downloadListPath} + ${downloadedFiles.size} referenced files`); + } + + /** + * Extract image URLs from HTML content + */ + extractImageUrls(html) { + const imageUrls = []; + + // Match img tags with src attributes + const imgTagRegex = /]+src\s*=\s*["']([^"']+)["'][^>]*>/gi; + let match; + + while ((match = imgTagRegex.exec(html)) !== null) { + const src = match[1]; + if (src && !src.startsWith('data:')) { // Skip data URLs + imageUrls.push(src); + } + } + + // Also look for CSS background images + const cssBackgroundRegex = /background-image\s*:\s*url\s*\(\s*["']?([^"')]+)["']?\s*\)/gi; + while ((match = cssBackgroundRegex.exec(html)) !== null) { + const src = match[1]; + if (src && !src.startsWith('data:')) { + imageUrls.push(src); + } + } + + // Look for WebTV-specific image references like + const wtvImageRegex = /]+src\s*=\s*["']([^"']+)["'][^>]*>/gi; + while ((match = wtvImageRegex.exec(html)) !== null) { + const src = match[1]; + if (src && !src.startsWith('data:')) { + imageUrls.push(src); + } + } + + // Remove duplicates and normalize URLs + return [...new Set(imageUrls.map(url => this.normalizeImageUrl(url)))]; + } + + /** + * Normalize image URL (convert relative URLs to absolute WebTV service URLs) + */ + normalizeImageUrl(url) { + // If it's already a full WebTV service URL, return as-is + if (url.match(/^[\w-]+:/)) { + return url; + } + + // If it starts with /, it's relative to the service root + if (url.startsWith('/')) { + // Extract service name from current URL + const currentMatch = this.url.match(/^([\w-]+):/); + if (currentMatch) { + return `${currentMatch[1]}:${url}`; + } + } + + // If it's a relative path, resolve it relative to current path + if (!url.startsWith('http://') && !url.startsWith('https://')) { + const currentMatch = this.url.match(/^([\w-]+):(.*)$/); + if (currentMatch) { + const serviceName = currentMatch[1]; + const currentPath = currentMatch[2]; + const basePath = currentPath.substring(0, currentPath.lastIndexOf('/') + 1); + return `${serviceName}:${basePath}${url}`; + } + } + + return url; + } + + /** + * Download an image from a WebTV service URL + */ + async downloadImage(imageUrl) { + this.debugLog(`Downloading image: ${imageUrl}`); + + try { + // Parse the image URL + const match = imageUrl.match(/^([\w-]+):\/?(.*)/); + if (!match) { + throw new Error(`Invalid image URL format: ${imageUrl}`); + } + + const serviceName = match[1]; + const path = '/' + (match[2] || ''); + + // Make request to download the image + const result = await this.makeRequestWithRetry(serviceName, path, null, true); // Skip redirects for images + + if (result.body && result.body.length > 0) { + this.debugLog(`Downloaded image: ${imageUrl} (${result.body.length} bytes)`); + return { body: result.body, headers: result.headers }; + } else { + throw new Error('Empty response'); + } + } catch (error) { + this.debugLog(`Failed to download image ${imageUrl}: ${error.message}`); + throw error; + } + } + + /** + * Store content and extract links for follow-all mode + */ + storeContent(url, response) { + if (!url || this.downloadedUrls.has(url)) { + return; + } + + // Validate MD5 checksum if we have one for this URL + this.validateDownloadChecksum(url, response.body); + + this.downloadedUrls.add(url); + this.allContent.set(url, response); + this.debugLog(`Stored content for: ${url} (${response.body.length} bytes)`); + + // Extract and queue new URLs if we haven't reached max depth + if (this.currentDepth < this.maxDepth) { + const newUrls = this.extractAllUrls(response.body, response.headers, url); + for (const newUrl of newUrls) { + if (!this.downloadedUrls.has(newUrl) && !this.pendingDownloads.includes(newUrl)) { + this.pendingDownloads.push(newUrl); + this.debugLog(`Queued for download: ${newUrl}`); + } + } + } + } + + /** + * Validate MD5 checksum for downloaded content + */ + validateDownloadChecksum(url, bodyData) { + if (!this.downloadChecksums || !this.downloadChecksums.has(url)) { + return; // No checksum to validate + } + + const expectedChecksum = this.downloadChecksums.get(url); + + try { + // Calculate MD5 hash of the downloaded content + const actualChecksum = crypto.createHash('md5').update(bodyData).digest('hex'); + + if (actualChecksum === expectedChecksum) { + this.debugLog(`✓ MD5 checksum validated for ${url}: ${actualChecksum}`); + } else { + console.warn(`✗ MD5 checksum mismatch for ${url}:`); + console.warn(` Expected: ${expectedChecksum}`); + console.warn(` Actual: ${actualChecksum}`); + console.warn(` This may indicate corrupted or modified content`); + } + } catch (error) { + console.warn(`Failed to validate checksum for ${url}: ${error.message}`); + } + } + + /** + * Extract all URLs from content (links, images, scripts, etc.) + */ + extractAllUrls(body, headers, baseUrl) { + const urls = []; + + try { + // First, check headers for navigation URLs + const headerUrls = this.extractUrlsFromHeaders(headers, baseUrl); + urls.push(...headerUrls); + + // Check for wtv/download-list content type + const contentType = headers['content-type'] || ''; + const normalizedContentType = contentType.split(';')[0].trim().toLowerCase(); + + if (normalizedContentType === 'wtv/download-list') { + const content = Buffer.isBuffer(body) ? body.toString('utf8') : body; + const downloadListUrls = this.extractUrlsFromDownloadList(content); + urls.push(...downloadListUrls); + return urls; // Download lists are special, don't process as HTML + } + + // Only process text content for HTML extraction + if (!/text\/html|text\/plain|application\/.*javascript|text\/css/i.test(contentType)) { + return urls; + } + + const content = Buffer.isBuffer(body) ? body.toString('utf8') : body; + + // Extract various types of URLs + const patterns = [ + // HTML links and form actions + /]+href\s*=\s*["']([^"']+)["'][^>]*>/gi, + /]+action\s*=\s*["']([^"']+)["'][^>]*>/gi, + /]+src\s*=\s*["']([^"']+)["'][^>]*>/gi, + /]+src\s*=\s*["']([^"']+)["'][^>]*>/gi, + + // Images and media + /]+src\s*=\s*["']([^"']+)["'][^>]*>/gi, + /]+src\s*=\s*["']([^"']+)["'][^>]*>/gi, + /]+src\s*=\s*["']([^"']+)["'][^>]*>/gi, + /]+src\s*=\s*["']([^"']+)["'][^>]*>/gi, + /]+src\s*=\s*["']([^"']+)["'][^>]*>/gi, + + // Scripts and stylesheets + /]+src\s*=\s*["']([^"']+)["'][^>]*>/gi, + /]+href\s*=\s*["']([^"']+)["'][^>]*>/gi, + + // CSS background images + /background-image\s*:\s*url\s*\(\s*["']?([^"')]+)["']?\s*\)/gi, + + // Meta redirects + /]+content\s*=\s*["'][^"']*url=([^"';]+)[^"']*["'][^>]*>/gi, + + // WebTV specific patterns + /wtv-[a-zA-Z0-9-]+:[^\s"'<>]+/gi, + + // WebTV upgradeblock tags + /]+blockurl\s*=\s*["']([^"']+)["'][^>]*>/gi, + ]; + + for (const pattern of patterns) { + let match; + while ((match = pattern.exec(content)) !== null) { + const url = match[1] || match[0]; // Some patterns capture the whole match + if (url && !url.startsWith('data:') && !url.startsWith('javascript:') && + !url.startsWith('mailto:') && !url.startsWith('http://') && + !url.startsWith('https://') && !url.startsWith('client:')) { + const normalizedUrl = this.normalizeUrl(url, baseUrl); + if (normalizedUrl && this.isValidWebTVUrl(normalizedUrl)) { + urls.push(normalizedUrl); + } + } + } + } + + // Look for table-based navigation (common in WebTV) + const tablePattern = /]*>.*?]+href\s*=\s*["']([^"']+)["'][^>]*>.*?<\/td>/gi; + let match; + while ((match = tablePattern.exec(content)) !== null) { + const url = match[1]; + if (url && !url.startsWith('data:') && !url.startsWith('javascript:') && + !url.startsWith('mailto:') && !url.startsWith('http://') && + !url.startsWith('https://') && !url.startsWith('client:')) { + const normalizedUrl = this.normalizeUrl(url, baseUrl); + if (normalizedUrl && this.isValidWebTVUrl(normalizedUrl)) { + urls.push(normalizedUrl); + } + } + } + + } catch (error) { + this.debugLog(`Error extracting URLs from ${baseUrl}: ${error.message}`); + } + + // Remove duplicates + return [...new Set(urls)]; + } + + /** + * Extract URLs from wtv/download-list content + * Parses "location:" lines within GET blocks (separated by double line breaks) + * Also extracts wtv-checksum values for validation + */ + extractUrlsFromDownloadList(content) { + const urls = []; + + // Split content into blocks separated by double line breaks + const blocks = content.split(/\r?\n\r?\n/); + + for (const block of blocks) { + const trimmedBlock = block.trim(); + + // Only process blocks that start with "GET" + if (trimmedBlock.startsWith('GET ')) { + const lines = trimmedBlock.split(/\r?\n/); + let currentUrl = null; + let currentChecksum = null; + + for (const line of lines) { + const trimmed = line.trim(); + + // Look for "location:" lines within GET blocks + if (trimmed.startsWith('location:')) { + currentUrl = trimmed.substring(9).trim(); // Remove "location:" prefix + } + + // Look for "wtv-checksum:" lines within GET blocks + if (trimmed.startsWith('wtv-checksum:')) { + currentChecksum = trimmed.substring(13).trim(); // Remove "wtv-checksum:" prefix + } + } + + // If we found a valid URL, add it (with checksum if available) + if (currentUrl && this.isValidWebTVUrl(currentUrl)) { + urls.push(currentUrl); + this.debugLog(`Found download-list location URL: ${currentUrl}`); + + // Store checksum for later validation if present + if (currentChecksum && currentChecksum !== '00000000000000000000000000000000') { + if (!this.downloadChecksums) { + this.downloadChecksums = new Map(); + } + this.downloadChecksums.set(currentUrl, currentChecksum); + this.debugLog(` -> Expected MD5 checksum: ${currentChecksum}`); + } + } + } + + // Also check for standalone "list" commands (not in GET blocks) + if (trimmedBlock.startsWith('list ')) { + // Extract URL from "list" command (format varies, look for URLs) + const match = trimmedBlock.match(/wtv-[^:\s]+:[^\s]+/); + if (match && this.isValidWebTVUrl(match[0])) { + urls.push(match[0]); + this.debugLog(`Found download-list command URL: ${match[0]}`); + } + } + } + + return urls; + } + + /** + * Extract URLs from response headers (wtv-visit, Location, etc.) + */ + extractUrlsFromHeaders(headers, baseUrl) { + const urls = []; + + try { + // Check for wtv-visit header + if (headers['wtv-visit']) { + const visitUrl = headers['wtv-visit']; + if (visitUrl && !visitUrl.startsWith('client:')) { + const normalizedUrl = this.normalizeUrl(visitUrl, baseUrl); + if (normalizedUrl && this.isValidWebTVUrl(normalizedUrl)) { + urls.push(normalizedUrl); + this.debugLog(`Found wtv-visit URL: ${normalizedUrl}`); + } + } + } + + // Check for Location header (redirects) + if (headers['Location'] || headers['location']) { + const locationUrl = headers['Location'] || headers['location']; + if (locationUrl && !locationUrl.startsWith('client:')) { + const normalizedUrl = this.normalizeUrl(locationUrl, baseUrl); + if (normalizedUrl && this.isValidWebTVUrl(normalizedUrl)) { + urls.push(normalizedUrl); + this.debugLog(`Found Location URL: ${normalizedUrl}`); + } + } + } + + // Check for other WebTV-specific headers that might contain URLs + const urlHeaders = [ + 'wtv-boot-url', + 'wtv-favorite-url', + 'wtv-home-url', + 'wtv-mail-url', + 'wtv-log-url', + 'wtv-phone-log-url', + 'wtv-relogin-url', + 'wtv-reconnect-url', + 'wtv-datadownload-url', + 'wtv-datadownload-login-url', + 'wtv-ssl-certs-download-url', + 'wtv-offline-mail-connect-url', + 'wtv-messenger-login-url', + 'wtv-notifications-url', + 'wtv-addresses-url', + 'wtv-settings-url', + 'wtv-search-url', + 'wtv-explore-url' + ]; + + for (const headerName of urlHeaders) { + if (headers[headerName]) { + const headerUrl = headers[headerName]; + if (headerUrl && !headerUrl.startsWith('client:')) { + const normalizedUrl = this.normalizeUrl(headerUrl, baseUrl); + if (normalizedUrl && this.isValidWebTVUrl(normalizedUrl)) { + urls.push(normalizedUrl); + this.debugLog(`Found ${headerName} URL: ${normalizedUrl}`); + } + } + } + } + + } catch (error) { + this.debugLog(`Error extracting URLs from headers: ${error.message}`); + } + + return urls; + } + + /** + * Check if URL should be followed by the spider + */ + shouldFollowUrl(url) { + // Normalize the URL first + const normalized = this.normalizeUrl(url, 'wtv-disk:/'); + return normalized && this.isValidWebTVUrl(normalized); + } + + /** + * Check if URL is a valid WebTV service URL that should be followed + */ + isValidWebTVUrl(url) { + // Don't follow HTTP/HTTPS URLs or client URLs + if (url.startsWith('http://') || url.startsWith('https://') || url.startsWith('client:')) { + return false; + } + + // Only follow WebTV service URLs + return /^wtv-[a-zA-Z0-9-]+:/i.test(url); + } + + /** + * Normalize URL (convert relative URLs to absolute WebTV service URLs) + */ + normalizeUrl(url, baseUrl) { + try { + // Skip URLs we don't want to follow + if (url.startsWith('http://') || url.startsWith('https://') || + url.startsWith('ftp://') || url.startsWith('client:')) { + return null; + } + + // If it's already a full WebTV service URL, return as-is + if (this.isValidWebTVUrl(url)) { + return url; + } + + // If it starts with /, it's relative to the service root + if (url.startsWith('/')) { + const currentMatch = baseUrl.match(/^([\w-]+):/); + if (currentMatch) { + return `${currentMatch[1]}:${url}`; + } + } + + // If it's a relative path, resolve it relative to current path + if (!url.includes('://')) { + const currentMatch = baseUrl.match(/^([\w-]+):(.*)$/); + if (currentMatch) { + const serviceName = currentMatch[1]; + const currentPath = currentMatch[2]; + const basePath = currentPath.substring(0, currentPath.lastIndexOf('/') + 1); + return `${serviceName}:${basePath}${url}`; + } + } + + return null; + } catch (error) { + this.debugLog(`Error normalizing URL ${url}: ${error.message}`); + return null; + } + } + + /** + * Make request with retry logic for ECONNREFUSED errors + */ + async makeRequestWithRetry(serviceName, path, postData = null, downloadMode = false, retryCount = 0) { + try { + return await this.makeRequest(serviceName, path, postData, downloadMode); + } catch (error) { + if (error.code === 'ECONNREFUSED' && retryCount < this.maxRetries) { + const retryDelay = 5000; // 5 seconds + this.debugLog(`Connection refused, retrying in ${retryDelay/1000}s (attempt ${retryCount + 1}/${this.maxRetries})`); + await new Promise(resolve => setTimeout(resolve, retryDelay)); + return this.makeRequestWithRetry(serviceName, path, postData, downloadMode, retryCount + 1); + } + throw error; // Re-throw if not ECONNREFUSED or max retries exceeded + } + } + + /** + * Process all pending downloads + */ + async processAllDownloads() { + this.debugLog(`\n*** Starting aggressive crawl mode - depth ${this.maxDepth} ***`); + + while (this.pendingDownloads.length > 0 && this.currentDepth < this.maxDepth) { + const currentBatch = [...this.pendingDownloads]; + this.pendingDownloads = []; + this.currentDepth++; + + this.debugLog(`\n--- Processing depth ${this.currentDepth} (${currentBatch.length} URLs) ---`); + + for (const url of currentBatch) { + if (this.downloadedUrls.has(url)) { + continue; // Skip if already downloaded + } + + try { + this.debugLog(`Downloading: ${url}`); + const match = url.match(/^([\w-]+):\/?(.*)/); + if (match) { + const serviceName = match[1]; + const path = '/' + (match[2] || ''); + const result = await this.makeRequestWithRetry(serviceName, path, null, true); + this.storeContent(url, result); + } + } catch (error) { + console.warn(`Failed to download ${url}: ${error.message}`); + } + + // Small delay to avoid overwhelming the server + await new Promise(resolve => setTimeout(resolve, 50)); + } + } + + this.debugLog(`\n*** Crawl complete - downloaded ${this.downloadedUrls.size} URLs ***`); + } + + /** + * Create comprehensive archive with all downloaded content + */ + async createComprehensiveArchive() { + const fs = require('fs').promises; + + this.debugLog('Creating comprehensive archive with all downloaded content...'); + + // Determine output filename - change extension to .zip if needed + let archivePath = this.outputFile; + if (!archivePath.endsWith('.zip')) { + const ext = path.extname(archivePath); + archivePath = archivePath.replace(ext, '.zip'); + } + + // Create zip archive + const zip = new AdmZip(); + + // Add all downloaded content + let addedFiles = 0; + for (const [url, response] of this.allContent) { + try { + const servicePath = this.getServicePath(url, response.headers || {}); + zip.addFile(servicePath, response.body); + addedFiles++; + this.debugLog(`Added to archive: ${servicePath} (from ${url})`); + + // Log content type for debugging + const contentType = response.headers ? response.headers['content-type'] : 'unknown'; + if (contentType === 'text/tellyscript' || contentType === 'text/dialscript') { + this.debugLog(` -> TellyScript/DialScript content detected, saved as .tok file`); + } + } catch (error) { + console.warn(`Failed to add ${url} to archive: ${error.message}`); + } + } + + // Write the zip file + zip.writeZip(archivePath); + + console.log(`Comprehensive archive created: ${archivePath}`); + console.log(`Archive contains ${addedFiles} files from ${this.downloadedUrls.size} URLs`); + + // Print summary by service + const serviceStats = {}; + for (const url of this.downloadedUrls) { + const serviceName = url.match(/^([\w-]+):/)?.[1] || 'unknown'; + serviceStats[serviceName] = (serviceStats[serviceName] || 0) + 1; + } + + console.log('\nContent by service:'); + for (const [service, count] of Object.entries(serviceStats)) { + console.log(` ${service}: ${count} files`); + } + } + + /** + * Get file extension based on content-type header + */ + getExtensionFromContentType(contentType) { + if (!contentType) return null; + + const type = contentType.toLowerCase().split(';')[0].trim(); + + // Map content types to file extensions + const typeMap = { + // WebTV specific + 'text/tellyscript': '.tok', + 'text/dialscript': '.tok', + 'wtv/download-list': '.txt', + + // Images + 'image/gif': '.gif', + 'image/jpeg': '.jpg', + 'image/jpg': '.jpg', + 'image/png': '.png', + 'image/bmp': '.bmp', + 'image/webp': '.webp', + 'image/svg+xml': '.svg', + 'image/x-icon': '.ico', + 'image/tiff': '.tiff', + + // Text/HTML + 'text/html': '.html', + 'text/plain': '.txt', + 'text/css': '.css', + 'text/javascript': '.js', + 'application/javascript': '.js', + 'application/x-javascript': '.js', + + // Audio/Video + 'audio/mpeg': '.mp3', + 'audio/wav': '.wav', + 'video/mpeg': '.mpg', + 'video/quicktime': '.mov', + + // Other + 'application/pdf': '.pdf', + 'application/zip': '.zip', + 'application/x-shockwave-flash': '.swf' + }; + + return typeMap[type] || null; + } + + /** + * Convert WebTV service URL to service/path format for zip structure + */ + getServicePath(url, headers = {}) { + try { + // Parse WebTV service URL: wtv-service:/path or wtv-service:path + const match = url.match(/^([\w-]+):\/?(.*)/); + if (match) { + const serviceName = match[1]; + let pathPart = match[2] || ''; + + // Remove query string and fragment + pathPart = pathPart.split('?')[0].split('#')[0]; + + // Remove leading slash if present + pathPart = pathPart.replace(/^\/+/, ''); + + // Check content type for extension determination + const contentType = headers['content-type'] || ''; + const contentTypeExt = this.getExtensionFromContentType(contentType); + + // If path is empty or ends with /, add appropriate filename + if (!pathPart || pathPart.endsWith('/')) { + // Determine file extension based on content type first, then fallback to URL patterns + const ext = contentTypeExt || + (serviceName.includes('image') || url.includes('image') ? '.jpg' : + serviceName === 'wtv-home' || serviceName === 'wtv-guide' ? '.html' : + serviceName === 'wtv-content' ? '.html' : + url.includes('.html') || url.includes('.htm') ? '.html' : + url.includes('.jpg') || url.includes('.jpeg') ? '.jpg' : + url.includes('.png') ? '.png' : + url.includes('.gif') ? '.gif' : + url.includes('.css') ? '.css' : + url.includes('.js') ? '.js' : '.html'); + + const filename = pathPart ? 'index' + ext : serviceName.replace('wtv-', '') + ext; + pathPart = pathPart + filename; + } + + // Ensure the path has a file extension if it doesn't already + if (!pathPart.includes('.')) { + // Determine extension based on content type first, then context + const ext = contentTypeExt || + (serviceName.includes('image') || pathPart.includes('image') || + pathPart.match(/\.(jpe?g|png|gif|bmp|webp|svg|ico|tiff)$/i) ? '.jpg' : + serviceName.includes('style') || pathPart.includes('style') || pathPart.includes('css') ? '.css' : + serviceName.includes('script') || pathPart.includes('script') || pathPart.includes('js') ? '.js' : '.html'); + pathPart += ext; + } + + // Override extension if content type provides a more specific one + if (contentTypeExt && !pathPart.endsWith(contentTypeExt)) { + pathPart = pathPart.replace(/\.[^.]*$/, contentTypeExt); + } + + // Clean up path separators and ensure valid filename + pathPart = pathPart.replace(/\/+/g, '/').replace(/\/$/, ''); + + return `${serviceName}/${pathPart}`; + } + } catch (error) { + this.debugLog(`Error creating service path from ${url}: ${error.message}`); + } + + // Fallback: create a reasonable path structure + const serviceName = url.match(/^([\w-]+):/)?.[1] || 'unknown-service'; + const filename = this.getFilenameFromUrl(url, headers); + return `${serviceName}/${filename}`; + } + + /** + * Extract filename from URL + */ + getFilenameFromUrl(url, headers = {}) { + try { + // Check content type first for extension determination + const contentType = headers['content-type'] || ''; + const contentTypeExt = this.getExtensionFromContentType(contentType); + + // Remove WebTV service prefix and extract path + const match = url.match(/^(?:[\w-]+:\/?)?(.*?)(?:\?.*)?$/); + if (match) { + const pathPart = match[1]; + const filename = pathPart.split('/').pop(); + + // If no filename or just path, generate one based on content + if (!filename || filename === pathPart || filename === '') { + // Determine extension based on content type first, then URL patterns + const ext = contentTypeExt || + (url.includes('.html') || url.includes('.htm') ? '.html' : + url.includes('.jpg') || url.includes('.jpeg') ? '.jpg' : + url.includes('.png') ? '.png' : + url.includes('.gif') ? '.gif' : + url.includes('.bmp') ? '.bmp' : + url.includes('.webp') ? '.webp' : + url.includes('.svg') ? '.svg' : + url.includes('.ico') ? '.ico' : + url.includes('.tiff') ? '.tiff' : + url.includes('.js') ? '.js' : + url.includes('.css') ? '.css' : ''); + + // Generate filename based on service and path + const serviceName = url.match(/^([\w-]+):/)?.[1] || 'content'; + const pathHash = require('crypto').createHash('md5').update(pathPart).digest('hex').substring(0, 8); + return `${serviceName}_${pathHash}${ext}`; + } + + // Ensure filename has appropriate extension if missing + if (!filename.includes('.')) { + const ext = contentTypeExt || + (url.includes('image') || /\.(jpe?g|png|gif|bmp|webp|svg|ico|tiff)$/i.test(url) ? '.jpg' : + url.includes('style') || url.includes('css') ? '.css' : + url.includes('script') || url.includes('js') ? '.js' : ''); + return filename + ext; + } + + // Override extension if content type provides a more specific one + if (contentTypeExt && !filename.endsWith(contentTypeExt)) { + return filename.replace(/\.[^.]*$/, contentTypeExt); + } + + return filename; + } + } catch (error) { + this.debugLog(`Error extracting filename from ${url}: ${error.message}`); + } + + // Fallback to generic filename with hash + const hash = require('crypto').createHash('md5').update(url).digest('hex').substring(0, 8); + const contentType = headers['content-type'] || ''; + const contentTypeExt = this.getExtensionFromContentType(contentType); + const ext = contentTypeExt || ''; + return `content_${hash}${ext}`; + } + /** * Clean up resources */ @@ -979,6 +1922,10 @@ function parseArgs() { maxRedirects: 10, useEncryption: false, request_type_download: false, + followImages: false, + followAll: false, + maxDepth: 3, + maxRetries: 5, debug: false }; @@ -1010,11 +1957,12 @@ function parseArgs() { } break; case '--file': + case '--output': if (i + 1 < args.length) { config.outputFile = args[++i]; } break; - case '--download': + case '--dl-mode': config.request_type_download = true; break; case '--tricks': @@ -1026,6 +1974,23 @@ function parseArgs() { case '--debug': config.debug = true; break; + case '--follow': + config.followImages = true; + break; + case '--follow-all': + config.followAll = true; + config.followImages = true; // follow-all implies follow + break; + case '--depth': + if (i + 1 < args.length) { + config.maxDepth = parseInt(args[++i]); + } + break; + case '--retries': + if (i + 1 < args.length) { + config.maxRetries = parseInt(args[++i]); + } + break; case '--help': console.log(` WebTV Client Simulator @@ -1039,14 +2004,20 @@ Options: --url Target URL to fetch after authentication (default: wtv-home:/home) --file Save response body to file instead of echoing to CLI --max-redirects Maximum number of wtv-visit redirects (default: 10) - --download Enable 'wtv-request-type: download' for diskmap testing + --dl-mode Enable 'wtv-request-type: download' for diskmap testing on minisrv --encryption Enable RC4 encryption after authentication - --tricks-access Enable tricks access for the target URL + --tricks-access Enable tricks access for the target URL (requires wtv-tricks:/access?url= on server) + --follow Download HTML and all referenced images into a zip archive + --follow-all Aggressively download everything encountered (spider mode) + --depth Maximum crawl depth for --follow-all mode (default: 5) + --retries Maximum number of retries for ECONNREFUSED errors (default: 5) --debug Enable debug logging --help Show this help message Example: node client_emu.js --host 192.168.1.100 --port 1615 --ssid 8100000000000001 --url wtv-home:/home --file output.html + node client_emu.js --host 127.0.0.1 --url wtv-home:/home --file archive.zip --follow --debug + node client_emu.js --host 127.0.0.1 --url wtv-home:/home --file complete.zip --follow-all --depth 2 --debug `); process.exit(0); } @@ -1060,7 +2031,7 @@ Example: */ async function main() { const config = parseArgs(); - const simulator = new WebTVClientSimulator(config.host, config.port, config.ssid, config.url, config.outputFile, config.maxRedirects, config.useEncryption, config.request_type_download, config.debug, config.useTricksAccess); + const simulator = new WebTVClientSimulator(config.host, config.port, config.ssid, config.url, config.outputFile, config.maxRedirects, config.useEncryption, config.request_type_download, config.debug, config.useTricksAccess, config.followImages, config.followAll, config.maxDepth, config.maxRetries); // Handle graceful shutdown process.on('SIGINT', () => {