Files
Mimante/Mimante/Services/ClosedAuctionsScraper.cs
Alberto Balbo 61f0945db2 Supporto PostgreSQL, statistiche avanzate e nuova UI
Aggiornamento massivo: aggiunto backend PostgreSQL per statistiche aste con fallback SQLite, nuovi modelli e servizi, UI moderna con grafici interattivi, refactoring stato applicazione (ApplicationStateService), documentazione completa per deploy Docker/Unraid/Gitea, nuovi CSS e script JS per UX avanzata, template Unraid, test database, e workflow CI/CD estesi. Pronto per produzione e analisi avanzate.
2026-01-18 17:52:05 +01:00

370 lines
17 KiB
C#

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using AutoBidder.Models;
namespace AutoBidder.Services
{
/// <summary>
/// Semplice scraper che scarica la pagina delle "closed auctions" di Bidoo,
/// estrae i link alle singole aste, visita ciascuna pagina e prova ad estrarre
/// informazioni utili (nome prodotto, prezzo finale, vincitore, puntate usate).
/// Risultato salvato in CSV per analisi statistiche esterne.
///
/// Nota: il parsing è basato su euristiche (regex) per resistere a vari formati HTML.
/// </summary>
public class ClosedAuctionsScraper
{
// HTTP client shared by every request this scraper instance issues.
private readonly HttpClient _http;
// Optional statistics service supplied by the caller; in the visible code it is only stored.
private readonly StatsService? _statsService;
// Optional logging callback; when null, log messages are simply skipped.
private readonly Action<string>? _log;

/// <summary>
/// Creates a scraper with a 10-second request timeout and browser-like default headers.
/// </summary>
/// <param name="handler">
/// Message handler to send requests through. When null, an <see cref="HttpClientHandler"/>
/// with GZip, Deflate and Brotli decompression enabled is created.
/// </param>
/// <param name="statsService">Optional statistics service to keep a reference to.</param>
/// <param name="log">Optional callback that receives diagnostic messages.</param>
public ClosedAuctionsScraper(HttpMessageHandler? handler = null, StatsService? statsService = null, Action<string>? log = null)
{
    _statsService = statsService;
    _log = log;

    HttpMessageHandler transport;
    if (handler != null)
    {
        transport = handler;
    }
    else
    {
        transport = new HttpClientHandler
        {
            AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate | DecompressionMethods.Brotli
        };
    }

    _http = new HttpClient(transport);
    _http.Timeout = TimeSpan.FromSeconds(10);

    // Present the requests as coming from a regular desktop browser.
    var browserHeaders = new (string Name, string Value)[]
    {
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
        ("Accept-Language", "it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7"),
    };
    foreach (var (name, value) in browserHeaders)
    {
        _http.DefaultRequestHeaders.TryAddWithoutValidation(name, value);
    }
}
/// <summary>
/// Scrapes the closed-auctions listing (and every auction page it links to)
/// and writes the collected records to a CSV file.
/// </summary>
/// <param name="closedAuctionsUrl">URL of the closed-auctions listing page.</param>
/// <param name="outputCsvPath">Path of the CSV file to write.</param>
public async Task ScrapeAndSaveCsvAsync(string closedAuctionsUrl, string outputCsvPath)
{
    List<ClosedAuctionRecord> scraped = await ScrapeAsync(closedAuctionsUrl);
    SaveCsv(scraped, outputCsvPath);
}
/// <summary>
/// Scrapes the closed-auctions listing and every auction page it links to,
/// buffering all records in memory. Nothing is written to disk.
/// </summary>
/// <param name="closedAuctionsUrl">URL of the closed-auctions listing page.</param>
/// <returns>The records in the order they were produced by <see cref="ScrapeYieldAsync"/>.</returns>
public async Task<List<ClosedAuctionRecord>> ScrapeAsync(string closedAuctionsUrl)
{
    var collected = new List<ClosedAuctionRecord>();
    IAsyncEnumerable<ClosedAuctionRecord> stream = ScrapeYieldAsync(closedAuctionsUrl);

    await foreach (var item in stream)
    {
        collected.Add(item);
    }

    return collected;
}
/// <summary>
/// Downloads the closed-auctions listing and yields one record per linked auction,
/// so that a consumer (e.g. a UI) can update incrementally.
/// </summary>
/// <remarks>
/// This is an async iterator: nothing runs, including the argument check, until the
/// result is enumerated. A failure on a single auction does not stop the enumeration;
/// it produces a placeholder record whose <c>Notes</c> holds the error message.
/// </remarks>
/// <param name="closedAuctionsUrl">URL of the closed-auctions listing page.</param>
/// <exception cref="ArgumentNullException">The URL is null, empty or whitespace.</exception>
/// <exception cref="InvalidOperationException">The listing page could not be downloaded.</exception>
public async IAsyncEnumerable<ClosedAuctionRecord> ScrapeYieldAsync(string closedAuctionsUrl)
{
    if (string.IsNullOrWhiteSpace(closedAuctionsUrl))
    {
        throw new ArgumentNullException(nameof(closedAuctionsUrl));
    }

    _log?.Invoke($"[scraper] Downloading closed auctions page: {closedAuctionsUrl}");
    var listingHtml = await GetStringAsync(closedAuctionsUrl);
    if (listingHtml == null)
    {
        _log?.Invoke("[scraper] ERROR: unable to download closed auctions page");
        throw new InvalidOperationException("Impossibile scaricare la pagina delle aste chiuse.");
    }

    var siteRoot = new Uri("https://it.bidoo.com/");
    var links = ExtractAuctionLinks(listingHtml, siteRoot).Distinct().ToList();
    _log?.Invoke($"[scraper] Found {links.Count} auction links on closed auctions page.");

    foreach (var link in links)
    {
        yield return await ScrapeSingleAuctionAsync(listingHtml, link);
    }
}

// Builds the record for one auction; every failure is turned into a "(parse error)" record
// instead of propagating, because a yield cannot sit inside a try/catch in the iterator itself.
private async Task<ClosedAuctionRecord> ScrapeSingleAuctionAsync(string listingHtml, string auctionUrl)
{
    try
    {
        _log?.Invoke($"[scraper] Fetching auction page: {auctionUrl}");

        var summary = ExtractSummaryInfoForUrl(listingHtml, auctionUrl);
        var detailHtml = await GetStringAsync(auctionUrl);
        if (detailHtml == null)
        {
            _log?.Invoke($"[scraper] WARNING: failed to download auction page: {auctionUrl}");
            throw new InvalidOperationException("Download auction page failed");
        }

        // Values found on the listing page win; the detail page is only a fallback for them.
        var productName = summary?.ProductName ?? ExtractProductNameFromAuctionHtml(detailHtml);
        var finalPrice = summary?.FinalPrice ?? ExtractFinalPriceFromAuctionHtml(detailHtml);
        var winner = summary?.Winner ?? ExtractWinnerFromAuctionHtml(detailHtml);
        var bidsUsed = ExtractBidsUsedFromAuctionHtml(detailHtml);

        var priceText = finalPrice?.ToString("F2", CultureInfo.InvariantCulture) ?? "null";
        var bidsText = bidsUsed?.ToString() ?? "null";
        _log?.Invoke($"[scraper] Parsed: Name='{productName}', FinalPrice={priceText}, Winner='{winner}', BidsUsed={bidsText}");

        // The extraction helpers have already HTML-decoded the text fields.
        return new ClosedAuctionRecord
        {
            AuctionUrl = auctionUrl,
            ProductName = productName,
            FinalPrice = finalPrice,
            Winner = winner,
            BidsUsed = bidsUsed,
            ScrapedAt = DateTime.UtcNow,
            Notes = string.Empty
        };
    }
    catch (Exception ex)
    {
        _log?.Invoke($"[scraper] ERROR parsing auction {auctionUrl}: {ex.Message}");
        return new ClosedAuctionRecord
        {
            AuctionUrl = auctionUrl,
            ProductName = "(parse error)",
            FinalPrice = null,
            Winner = null,
            BidsUsed = null,
            ScrapedAt = DateTime.UtcNow,
            Notes = ex.Message
        };
    }
}
/// <summary>
/// Performs a GET request and returns the response body as text.
/// </summary>
/// <param name="url">
/// Absolute URL, or a URL relative to <c>https://it.bidoo.com</c>.
/// </param>
/// <returns>
/// The response body, or null when anything goes wrong (invalid URL, network error,
/// timeout, non-success status code). The failure is reported through the log callback.
/// </returns>
private async Task<string?> GetStringAsync(string url)
{
    try
    {
        var uri = new Uri(url, UriKind.RelativeOrAbsolute);
        if (!uri.IsAbsoluteUri)
        {
            uri = new Uri(new Uri("https://it.bidoo.com"), url);
        }

        // Both the request and the response own disposable resources (content streams,
        // the underlying connection lease); dispose them once the body has been read.
        using var req = new HttpRequestMessage(HttpMethod.Get, uri);
        req.Headers.TryAddWithoutValidation("Referer", "https://it.bidoo.com/");

        using var resp = await _http.SendAsync(req);
        resp.EnsureSuccessStatusCode();

        var txt = await resp.Content.ReadAsStringAsync();
        _log?.Invoke($"[scraper] HTTP {resp.StatusCode} {uri}");
        return txt;
    }
    catch (Exception ex)
    {
        // Deliberately best-effort: callers treat null as "download failed".
        _log?.Invoke($"[scraper] HTTP ERROR fetching {url}: {ex.Message}");
        return null;
    }
}
/// <summary>
/// Collects candidate auction URLs from the closed-auctions listing HTML.
/// </summary>
/// <param name="closedHtml">HTML of the listing page.</param>
/// <param name="baseUri">Base used to turn relative links into absolute ones.</param>
/// <returns>
/// Absolute URLs in document order, data-href values first. Duplicates are NOT removed here.
/// </returns>
private IEnumerable<string> ExtractAuctionLinks(string closedHtml, Uri baseUri)
{
    var urls = new List<string>();

    // Primary source: the value of every double-quoted data-href attribute.
    var dataHrefMatches = Regex.Matches(closedHtml, "data-href\\s*=\\s*\\\"(?<u>[^\\\"]+)\\\"", RegexOptions.IgnoreCase);
    foreach (Match m in dataHrefMatches)
    {
        var u = m.Groups["u"].Value.Trim();
        if (!string.IsNullOrEmpty(u))
        {
            urls.Add(ToAbsolute(u, baseUri));
        }
    }

    // Fallback: plain href attributes pointing at auction.php?a=...
    // The dot is escaped so that only a literal "auction.php" is accepted.
    var hrefMatches = Regex.Matches(closedHtml, "href\\s*=\\s*\\\"(?<u>[^\\\"]*auction\\.php\\?a=[^\\\"]+)\\\"", RegexOptions.IgnoreCase);
    foreach (Match m in hrefMatches)
    {
        var u = m.Groups["u"].Value.Trim();
        urls.Add(ToAbsolute(u, baseUri));
    }

    return urls.Where(u => !string.IsNullOrWhiteSpace(u));
}
/// <summary>
/// Converts a link found in the HTML into an absolute URL string.
/// </summary>
/// <param name="url">Absolute, protocol-relative ("//host/..."), root-relative or bare relative URL.</param>
/// <param name="baseUri">Base URI that relative links are resolved against.</param>
/// <returns>
/// The absolute URL. If resolution throws, the input is returned unchanged.
/// </returns>
private string ToAbsolute(string url, Uri baseUri)
{
    try
    {
        // All prefix checks are ordinal: URL syntax is not culture-dependent, and
        // culture-sensitive StartsWith can ignore invisible characters.
        if (url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            return url;
        }

        if (url.StartsWith("//", StringComparison.Ordinal))
        {
            return "https:" + url;
        }

        if (url.StartsWith("/", StringComparison.Ordinal))
        {
            return new Uri(baseUri, url).ToString();
        }

        // Bare relative links are treated as relative to the site root.
        return new Uri(baseUri, "/" + url).ToString();
    }
    catch
    {
        // Best effort: hand back the raw value rather than failing the whole scrape.
        return url;
    }
}
/// <summary>
/// Looks for the given auction URL inside the listing HTML and, around that position,
/// tries to read the product name, final price and winner shown on the listing.
/// </summary>
/// <param name="closedHtml">HTML of the closed-auctions listing page.</param>
/// <param name="auctionUrl">URL text to search for (ordinal, case-insensitive).</param>
/// <returns>
/// Null when the URL text is not present or an exception occurs; otherwise a tuple whose
/// individual members are null when the corresponding pattern did not match.
/// </returns>
/// <remarks>
/// NOTE(review): the search uses the URL exactly as passed in. If the caller passes an
/// absolutized URL while the listing contains the relative form, the lookup will miss
/// and this method returns null. Confirm against the real page markup.
/// </remarks>
private (string? ProductName, double? FinalPrice, string? Winner)? ExtractSummaryInfoForUrl(string closedHtml, string auctionUrl)
{
    try
    {
        int hit = closedHtml.IndexOf(auctionUrl, StringComparison.OrdinalIgnoreCase);
        if (hit < 0)
        {
            return null;
        }

        // Inspect a window that starts up to 800 chars before the link and is at most 2500 chars long.
        int windowStart = hit > 800 ? hit - 800 : 0;
        int windowLength = Math.Min(2500, closedHtml.Length - windowStart);
        string window = closedHtml.Substring(windowStart, windowLength);

        // Product name: <b class="media-heading"> first, then the <span class="media-heading ..."> variant.
        Match heading = Regex.Match(window, "<b[^>]*class=\\\"media-heading\\\"[^>]*>\\s*<a[^>]*>(?<name>[^<]+)</a>", RegexOptions.IgnoreCase);
        if (!heading.Success)
        {
            heading = Regex.Match(window, "<span[^>]*class=\\\"media-heading[^\\\"]*\\\"[^>]*>\\s*<a[^>]*>(?<name>[^<]+)</a>", RegexOptions.IgnoreCase);
        }
        string? product = null;
        if (heading.Success)
        {
            product = WebUtility.HtmlDecode(heading.Groups["name"].Value).Trim();
        }

        // Final price.
        Match priceHit = Regex.Match(window, "<span[^>]*class=\\\"price\\\"[^>]*>(?<p>[0-9.,]+)\\s*€", RegexOptions.IgnoreCase);
        double? price = priceHit.Success ? ParseEuro(priceHit.Groups["p"].Value) : null;

        // Winner: desktop markup first (may span lines), then the mobile markup.
        Match winnerHit = Regex.Match(window, "<span[^>]*class=\\\"username\\\"[^>]*>.*?<span[^>]*class=\\\"offer\\\"[^>]*>(?<w>[^<]+)</span>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        if (!winnerHit.Success)
        {
            winnerHit = Regex.Match(window, "<span[^>]*class=\\\"mobile_offerer offer\\\"[^>]*>(?<w>[^<]+)</span>", RegexOptions.IgnoreCase);
        }
        string? winner = null;
        if (winnerHit.Success)
        {
            winner = WebUtility.HtmlDecode(winnerHit.Groups["w"].Value).Trim();
        }

        return (product, price, winner);
    }
    catch
    {
        // Best effort: any failure here just means "no summary available".
        return null;
    }
}
/// <summary>
/// Extracts the product name from an auction detail page.
/// </summary>
/// <param name="auctionHtml">HTML of the auction page; may be null or empty.</param>
/// <returns>
/// The tag-stripped, HTML-decoded, trimmed content of the first <c>h1</c>; failing that, of
/// the <c>title</c>; failing that, the link text inside a <c>media-heading</c> element.
/// Null when the input is null/empty or nothing matches.
/// </returns>
private string? ExtractProductNameFromAuctionHtml(string? auctionHtml)
{
    if (string.IsNullOrEmpty(auctionHtml))
    {
        return null;
    }

    const RegexOptions multiLine = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    Match h1 = Regex.Match(auctionHtml, "<h1[^>]*>(?<n>.*?)</h1>", multiLine);
    if (h1.Success)
    {
        string inner = StripTags(h1.Groups["n"].Value);
        return WebUtility.HtmlDecode(inner).Trim();
    }

    Match title = Regex.Match(auctionHtml, "<title[^>]*>(?<t>.*?)</title>", multiLine);
    if (title.Success)
    {
        string inner = StripTags(title.Groups["t"].Value);
        return WebUtility.HtmlDecode(inner).Trim();
    }

    Match heading = Regex.Match(auctionHtml, "<b[^>]*class=\\\"media-heading\\\"[^>]*>\\s*<a[^>]*>(?<name>[^<]+)</a>", RegexOptions.IgnoreCase);
    return heading.Success
        ? WebUtility.HtmlDecode(heading.Groups["name"].Value).Trim()
        : null;
}
/// <summary>
/// Extracts the final price (in euro) from an auction detail page.
/// </summary>
/// <param name="auctionHtml">HTML of the auction page; may be null or empty.</param>
/// <returns>
/// The first price that both matches one of the patterns below and can be parsed by
/// <see cref="ParseEuro"/>; null when the input is null/empty or no pattern yields a number.
/// </returns>
private double? ExtractFinalPriceFromAuctionHtml(string? auctionHtml)
{
    if (string.IsNullOrEmpty(auctionHtml))
    {
        return null;
    }

    // Tried in order, most specific first. Every pattern captures the amount in group "p".
    var patterns = new[]
    {
        // <span class="price">12,34 €
        "<span[^>]*class=\\\"price\\\"[^>]*>(?<p>[0-9.,]+)\\s*€",
        // "prezzo" (or a prefix of it) followed within 30 non-digit chars by an amount and €
        "prez[zo]?[\\\"\\']?[^0-9]{0,30}(?<p>[0-9.,]+)\\s*€",
        // Last resort: any amount with two decimals followed by €. The look-behinds stop the
        // match from starting in the middle of a number, so "1.234,56 €" and "1234,56 €" are
        // captured whole rather than truncated to "234,56".
        "(?<![0-9])(?<![0-9][.,])(?<p>[0-9]+(?:\\.[0-9]{3})*[.,][0-9]{2})\\s*€",
    };

    foreach (var pattern in patterns)
    {
        var m = Regex.Match(auctionHtml, pattern, RegexOptions.IgnoreCase);
        if (!m.Success)
        {
            continue;
        }

        // A textual match that does not parse (e.g. a lone "," captured by [0-9.,]+) must not
        // end the search; fall through to the next pattern instead of returning null.
        var parsed = ParseEuro(m.Groups["p"].Value);
        if (parsed.HasValue)
        {
            return parsed;
        }
    }

    return null;
}
/// <summary>
/// Extracts the winner's user name from an auction detail page.
/// </summary>
/// <param name="auctionHtml">HTML of the auction page; may be null or empty.</param>
/// <returns>
/// The HTML-decoded, trimmed capture of the first pattern that matches;
/// null when the input is null/empty or no pattern matches.
/// </returns>
private string? ExtractWinnerFromAuctionHtml(string? auctionHtml)
{
    if (string.IsNullOrEmpty(auctionHtml))
    {
        return null;
    }

    // Tried in order; every pattern captures the name in group "w".
    var probes = new (string Pattern, RegexOptions Options)[]
    {
        // "Vincitore" label followed by an element holding the name.
        ("Vincitore[:\\s\\\"]+<[^>]*>(?<w>[^<]+)</", RegexOptions.IgnoreCase),
        // username span containing an offer span (may span several lines).
        ("<span[^>]*class=\\\"username\\\"[^>]*>.*?<span[^>]*class=\\\"offer\\\"[^>]*>(?<w>[^<]+)</span>", RegexOptions.IgnoreCase | RegexOptions.Singleline),
        // Mobile markup.
        ("mobile_offerer offer\\\"[^>]*>(?<w>[^<]+)<", RegexOptions.IgnoreCase),
    };

    foreach (var (pattern, options) in probes)
    {
        Match hit = Regex.Match(auctionHtml, pattern, options);
        if (hit.Success)
        {
            return WebUtility.HtmlDecode(hit.Groups["w"].Value).Trim();
        }
    }

    return null;
}
/// <summary>
/// Extracts the number of bids used from an auction detail page.
/// </summary>
/// <param name="auctionHtml">HTML of the auction page; may be null or empty.</param>
/// <returns>
/// The number captured by the first pattern that matches and parses as an int;
/// null when the input is null/empty or no pattern produces a number.
/// </returns>
private int? ExtractBidsUsedFromAuctionHtml(string? auctionHtml)
{
    if (string.IsNullOrEmpty(auctionHtml))
    {
        return null;
    }

    // Tried strictly in this order; every pattern captures the number in group "n".
    var probes = new (string Pattern, RegexOptions Options)[]
    {
        // 1) Explicit element: <p class="bids-used"><span>628</span> Puntate utilizzate</p>
        ("class=\\\"bids-used\\\"[^>]*>[^<]*<span[^>]*>(?<n>[0-9]{1,7})</span>", RegexOptions.IgnoreCase | RegexOptions.Singleline),
        // 2) Number followed by "Puntate utilizzate" or a similar wording
        ("(?<n>[0-9]{1,7})\\s*(?:Puntate utilizzate|Puntate usate|puntate utilizzate|puntate usate|puntate)\\b", RegexOptions.IgnoreCase),
        // 3) Older fallbacks
        ("(?<n>[0-9]+)\\s*(?:puntate|Puntate|puntate usate|puntate_usate|pt\\.?|pts)\\b", RegexOptions.IgnoreCase),
        ("usato[sx]?\\s*(?<n>[0-9]{1,6})\\s*(?:puntate|pts|pt)\\b", RegexOptions.IgnoreCase),
        ("(Puntate\\s*(?:usate|vinte)?)[^0-9]{0,10}(?<n>[0-9]{1,6})", RegexOptions.IgnoreCase),
        ("<[^>]*>\\s*(?<n>[0-9]{1,6})\\s*(puntate)\\s*<", RegexOptions.IgnoreCase),
    };

    foreach (var (pattern, options) in probes)
    {
        Match hit = Regex.Match(auctionHtml, pattern, options);
        // A match whose digits do not fit an int is skipped and the next pattern is tried.
        if (hit.Success && int.TryParse(hit.Groups["n"].Value, out var count))
        {
            return count;
        }
    }

    return null;
}
/// <summary>
/// Parses a euro amount written with Italian ("1.234,56") or dot-decimal ("12.50") separators.
/// </summary>
/// <param name="s">Amount text without currency symbol; may be null, empty or whitespace.</param>
/// <returns>The parsed value, or null when the text is blank or not a number.</returns>
/// <remarks>
/// Separator roles are inferred as follows:
/// when both '.' and ',' occur, whichever comes last is the decimal separator;
/// a lone ',' is always the decimal separator;
/// a single '.' is the decimal separator unless it is followed by exactly three digits
/// ("1.234"), in which case it is a thousands separator; several dots are thousands separators.
/// </remarks>
private double? ParseEuro(string s)
{
    if (string.IsNullOrWhiteSpace(s))
    {
        return null;
    }

    var text = s.Trim();
    var lastComma = text.LastIndexOf(',');
    var lastDot = text.LastIndexOf('.');

    bool dotIsDecimal;
    if (lastDot < 0)
    {
        dotIsDecimal = false;
    }
    else if (lastComma >= 0)
    {
        // Both separators present: the later one marks the decimals.
        dotIsDecimal = lastDot > lastComma;
    }
    else
    {
        // Only dots: a single dot not followed by exactly three digits is a decimal point.
        var singleDot = text.IndexOf('.') == lastDot;
        var digitsAfterDot = text.Length - lastDot - 1;
        dotIsDecimal = singleDot && digitsAfterDot != 3;
    }

    // Normalize to invariant form: no thousands separators, '.' as decimal point.
    var normalized = dotIsDecimal
        ? text.Replace(",", "")
        : text.Replace(".", "").Replace(',', '.');

    if (double.TryParse(normalized, NumberStyles.Any, CultureInfo.InvariantCulture, out var d))
    {
        return d;
    }

    return null;
}
/// <summary>
/// Removes everything that looks like an HTML tag ("&lt;" up to the next "&gt;") from the text.
/// </summary>
/// <param name="input">Text to clean; null is treated as empty.</param>
/// <returns>The text with tags removed; never null.</returns>
private string StripTags(string input)
{
    // [^>]* (rather than .*?) also crosses line breaks, so tags whose attributes
    // are wrapped over several lines are removed too.
    return Regex.Replace(input ?? string.Empty, "<[^>]*>", string.Empty);
}
/// <summary>
/// Writes the records to a CSV file (UTF-8), replacing any existing file.
/// </summary>
/// <remarks>
/// Text columns and the timestamp are wrapped in double quotes, with embedded quotes doubled;
/// FinalPrice and BidsUsed are written unquoted and left empty when they have no value.
/// </remarks>
/// <param name="data">Records to write, one row each, in enumeration order.</param>
/// <param name="filePath">Destination file path.</param>
private void SaveCsv(IEnumerable<Models.ClosedAuctionRecord> data, string filePath)
{
    // Wraps a text value in quotes, doubling any quote it contains; null becomes "".
    string Quote(string? value) => "\"" + (value ?? string.Empty).Replace("\"", "\"\"") + "\"";

    var csv = new StringBuilder();
    csv.AppendLine("AuctionUrl,ProductName,FinalPrice,Winner,BidsUsed,ScrapedAt,Notes");

    foreach (var row in data)
    {
        var priceCell = row.FinalPrice?.ToString("F2", CultureInfo.InvariantCulture) ?? string.Empty;
        var bidsCell = row.BidsUsed?.ToString() ?? string.Empty;

        var cells = new[]
        {
            Quote(row.AuctionUrl),
            Quote(row.ProductName),
            priceCell,
            Quote(row.Winner),
            bidsCell,
            $"\"{row.ScrapedAt:O}\"",
            Quote(row.Notes),
        };
        csv.AppendLine(string.Join(",", cells));
    }

    File.WriteAllText(filePath, csv.ToString(), Encoding.UTF8);
}
}
}