Scrapix, un « aspirateur » web (web crawler) simple orienté VCL qui gère : la récupération HTTP, l’extraction de liens et de ressources (images, documents, audio, vidéo, ressources web), le respect optionnel de robots.txt, un mécanisme facultatif de téléchargement des ressources, des limites (nombre de fichiers trouvés, nombre de pages explorées), et des mises à jour UI thread-safe vers un TscListView et un TscStatusBar.
Le crawler est conçu pour être lancé depuis un thread d’arrière-plan et pour mettre à jour l’interface en toute sécurité via des wrappers TThread.Queue.
Il expose des commandes pour démarrer, mettre en pause, reprendre, annuler et attendre l’arrêt.
671302
Scrapix.Core.pas
unit Scrapix.Core;
interface
uses
{Winapi}
WinApi.Windows, WinApi.Messages,
{System}
System.SysUtils, System.Classes, System.Generics.Collections,
System.RegularExpressions, System.Types, System.Net.HttpClient,
System.Net.URLClient, System.IOUtils, System.Threading, System.SyncObjs,
{Vcl}
Vcl.ComCtrls, Vcl.StdCtrls, Vcl.CheckLst, Vcl.Graphics,
{StyleControls VCL}
scControls,
{Translate.Core}
Translate.Core;
const
// États d'exploration
STATE_RUNNING = 0;
STATE_PAUSED = 1;
STATE_CANCEL = 2;
// User-Agent utilisé pour toutes les requêtes HTTP
UserAgent = 'Scrapix/1.0';
type
TScrapix = class
private
{ --- Données de suivi et compteurs --- }
VisitedLinks: TDictionary
FFoundFiles: TDictionary
FBrokenLinks: TDictionary
TotalLinks: Integer;
FileCount: Integer;
BrokenCount: Integer;
FRobotsBlocked: Integer;
FLinksTraversed: Integer;
{ --- Contrôle d'exécution --- }
FState: Integer;
FPauseEvent: TEvent;
FStoppedEvent: TEvent;
{ --- Paramètres de crawl --- }
RequestTimeoutMs: Integer;
RequestDelayMs: Integer;
SameDomainOnly: Boolean;
RootDomain: string;
{ --- Téléchargement --- }
FAutoDownload: Boolean;
DownloadFolder: string;
{ --- Robots.txt --- }
RobotsRules: TDictionary
FRespectRobots: Boolean;
{ --- Filtres de ressources --- }
FSearchImages: Boolean;
FSearchDocuments: Boolean;
FSearchAudio: Boolean;
FSearchVideo: Boolean;
FSearchWeb: Boolean;
{ --- Exécution courante --- }
FRunning: Boolean;
FMaxDepth: Integer;
{ --- Chemins de rapports temporaires --- }
FVisitedFilePath: string;
FBrokenFilePath: string;
FFoundFilePath: string;
FLogFilePath: string;
FLogLock: TCriticalSection;
FDisableUIUpdates: Boolean;
{ --- Limites configurables --- }
FFoundFilesLimit: Integer;
FExploreLimit: Integer;
{ --- Fonctions internes --- }
// Retourne true si les mises à jour UI sont autorisées
function UIUpdatesAllowed: Boolean;
// Récupère le contenu d'une URL via GET et met à jour le ListView (thread-safe)
function GetWebContent(const URL: string; ListView: TscListView;
Depth: Integer; Logging: TscListBox): string;
// Extrait les liens et normalise les URL
procedure ExtractLinks(const HTML: string; BaseURL: string;
var LinkList: TStringList);
// Extrait sources médias (images, docs, audio, vidéo, ressources web) selon les flags actifs
procedure ExtractMediaSources(const HTML: string; BaseURL: string;
var ImageList, DocList, AudioList, VideoList, WebList: TStringList);
// Incrémente le compteur d'URL bloquées par robots.txt et met à jour le StatusBar
procedure IncrementRobotsBlocked(StatusBar: TscStatusBar);
// Incrémente le compteur de liens parcourus et applique la limite d'exploration
procedure IncrementLinksTraversed(StatusBar: TscStatusBar);
// Traite et enregistre un lien cassé : UI, dictionnaire, fichier rapport
procedure MarkBrokenLink(const URL: string; ListView: TscListView;
StatusBar: TscStatusBar; Logging: TscListBox);
// Vérifie la disponibilité d'un fichier via HEAD; fallback GET range si HEAD échoue
function IsFileAvailable(const URL: string; ListView: TscListView;
StatusBar: TscStatusBar; Depth: Integer; Logging: TscListBox): Boolean;
// Télécharge une ressource dans le dossier de téléchargement (organisé par type)
function DownloadFile(const URL: string; Client: THTTPClient;
out LocalPath: string; Logging: TscListBox): Boolean;
// Routine factorisée pour traiter une liste de ressources (vérif, robots, dispo, download)
procedure ProcessResourceGroup(ResourceList: TStringList;
const AcceptExts: array of string; StatusBar: TscStatusBar;
ListView: TscListView; Depth: Integer; const DefaultUIType: string;
Logging: TscListBox);
// Traite toutes les listes de ressources extraites d'une page (appel ProcessResourceGroup)
procedure ProcessFoundFiles(ImageList, DocList, AudioList, VideoList,
WebList: TStringList; StatusBar: TscStatusBar; ListView: TscListView;
CheckList: TscCheckListBox; Depth: Integer; Logging: TscListBox);
// Exploration récursive d'une page : récupération, extraction, traitement, récursion
procedure ExploreLinksRecursive(const URL: string; ListView: TscListView;
StatusBar: TscStatusBar; CheckList: TscCheckListBox; Depth: Integer;
Logging: TscListBox);
// Vérifie si LinkURL appartient au même domaine que BaseURL (ou sous-domaine)
function IsSameDomain(const BaseURL, LinkURL: string): Boolean;
// Normalise une URL relative ou absolue en URL absolue utilisable
function NormalizeURL(const BaseURL, RelOrAbsURL: string): string;
// Wrappers thread-safe pour mise à jour ListView / StatusBar
procedure SafeUpdateListViewStatus(ListView: TscListView;
const URL, StatusText: string; const Method: string = '');
procedure SafeUpdateListViewDownloadState(ListView: TscListView;
const URL, DownloadState: string);
procedure SafeUpdateListViewInfo(ListView: TscListView; const URL: string;
RespMs: Integer; const SizeBytes: string; Depth: Integer);
procedure SafeScrollListViewToBottom(ListView: TscListView);
procedure SafeSetStatusBarPanel(StatusBar: TscStatusBar;
PanelIndex: Integer; const Text: string);
// Logging thread-safe vers une TscListBox
procedure SafeLog(Logging: TscListBox; const Msg: string);
// robots.txt helpers : parse, cache et vérifie l'autorisation
function ParseRobots(const RobotsText: string;
OutList: TStringList): Boolean;
function EnsureRobotsForHost(const Host, Scheme: string): Boolean;
function IsAllowedByRobots(const URL: string): Boolean;
public
// Constructeur : initialise structure et valeurs par défaut
constructor Create;
// Destructeur : annule, attend et libère ressources
destructor Destroy; override;
// Configure les paramètres du crawl (timeouts, limites, options)
procedure ConfigureCrawl(const ARequestTimeoutMs, ARequestDelayMs: Integer;
ASameDomainOnly: Boolean; AAutoDownload: Boolean; ARespectRobots: Boolean;
AFoundFilesLimit: Integer; AExploreLimit: Integer);
// Démarre l'exploration synchroniquement ; crée rapports si demandé
procedure ExploreLinks(const URL: string; ListView: TscListView;
StatusBar: TscStatusBar; CheckList: TscCheckListBox; MaxDepth: Integer;
SaveBrokenToFile, SaveVisitedToFile, SaveFoundFilesToFile: Boolean;
Logging: TscListBox);
// Mettre en pause, reprendre ou annuler l'exploration
procedure PauseExploration;
procedure ResumeExploration;
procedure CancelExploration;
function IsCanceled: Boolean;
function IsPaused: Boolean;
function IsRunning: Boolean;
// Attend l'arrêt complet (bloquant) ; supprime le paramètre Timeout
function WaitForStop: Boolean;
// Applique les filtres de types de fichiers depuis une CheckList UI
procedure ApplyFileTypeFiltersFromCheckList(CheckList: TscCheckListBox);
// Autorise la désactivation des mises à jour UI (tests, performances)
property DisableUIUpdates: Boolean read FDisableUIUpdates
write FDisableUIUpdates;
end;
implementation
uses
System.Net.Mime, System.StrUtils;
function TScrapix.UIUpdatesAllowed: Boolean;
begin
Result := not FDisableUIUpdates;
end;
procedure TScrapix.SafeScrollListViewToBottom(ListView: TscListView);
begin
if not UIUpdatesAllowed then
Exit;
if ListView = nil then
Exit;
if TThread.Current.ThreadID = MainThreadID then
begin
if (csDestroying in ListView.ComponentState) or
(not ListView.HandleAllocated) then
Exit;
if ListView.Items.Count > 0 then
ListView.Items[ListView.Items.Count - 1].MakeVisible(False);
end
else
begin
TThread.Queue(nil,
procedure
begin
if not UIUpdatesAllowed then
Exit;
if (ListView = nil) or (csDestroying in ListView.ComponentState) or
(not ListView.HandleAllocated) then
Exit;
if ListView.Items.Count > 0 then
ListView.Items[ListView.Items.Count - 1].MakeVisible(False);
end);
end;
end;
procedure TScrapix.SafeSetStatusBarPanel(StatusBar: TscStatusBar;
PanelIndex: Integer; const Text: string);
begin
if not UIUpdatesAllowed then
Exit;
if StatusBar = nil then
Exit;
if PanelIndex < 0 then
Exit;
if TThread.Current.ThreadID = MainThreadID then
begin
if (csDestroying in StatusBar.ComponentState) or
(not StatusBar.HandleAllocated) then
Exit;
if PanelIndex < StatusBar.Panels.Count then
StatusBar.Panels.Text := Text;
end
else
begin
TThread.Queue(nil,
procedure
begin
if not UIUpdatesAllowed then
Exit;
if (StatusBar = nil) or (csDestroying in StatusBar.ComponentState) or
(not StatusBar.HandleAllocated) then
Exit;
if PanelIndex < StatusBar.Panels.Count then
StatusBar.Panels.Text := Text;
end);
end;
end;
procedure TScrapix.SafeUpdateListViewStatus(ListView: TscListView;
const URL, StatusText: string; const Method: string = '');
var
sURL, sStatus, sMethod: string;
I: Integer;
Item: TListItem;
begin
if not UIUpdatesAllowed then
Exit;
if ListView = nil then
Exit;
sURL := NormalizeURL(URL, URL);
if sURL = '' then
sURL := URL;
sStatus := StatusText;
sMethod := Method;
if TThread.Current.ThreadID = MainThreadID then
begin
if (csDestroying in ListView.ComponentState) or
(not ListView.HandleAllocated) then
Exit;
for I := 0 to ListView.Items.Count - 1 do
begin
Item := ListView.Items;
if SameText(Item.Caption, sURL) or SameText(Item.Caption, URL) then
begin
while Item.SubItems.Count < 6 do
Item.SubItems.Add('');
Item.SubItems := sStatus;
Item.SubItems := sMethod;
SafeScrollListViewToBottom(ListView);
Exit;
end;
end;
Item := ListView.Items.Add;
Item.Caption := sURL;
while Item.SubItems.Count < 6 do
Item.SubItems.Add('');
Item.SubItems := sStatus;
Item.SubItems := sMethod;
SafeScrollListViewToBottom(ListView);
end
else
begin
TThread.Queue(nil,
procedure
var
I2: Integer;
It: TListItem;
begin
if not UIUpdatesAllowed then
Exit;
if (ListView = nil) or (csDestroying in ListView.ComponentState) or
(not ListView.HandleAllocated) then
Exit;
for I2 := 0 to ListView.Items.Count - 1 do
begin
It := ListView.Items;
if SameText(It.Caption, sURL) or SameText(It.Caption, URL) then
begin
while It.SubItems.Count < 6 do
It.SubItems.Add('');
It.SubItems := sStatus;
It.SubItems := sMethod;
SafeScrollListViewToBottom(ListView);
Exit;
end;
end;
It := ListView.Items.Add;
It.Caption := sURL;
while It.SubItems.Count < 6 do
It.SubItems.Add('');
It.SubItems := sStatus;
It.SubItems := sMethod;
SafeScrollListViewToBottom(ListView);
end);
end;
end;
procedure TScrapix.SafeUpdateListViewDownloadState(ListView: TscListView;
const URL, DownloadState: string);
var
sURL, sState: string;
I: Integer;
Item: TListItem;
begin
if not UIUpdatesAllowed then
Exit;
if ListView = nil then
Exit;
sURL := NormalizeURL(URL, URL);
if sURL = '' then
sURL := URL;
sState := DownloadState;
if TThread.Current.ThreadID = MainThreadID then
begin
if (csDestroying in ListView.ComponentState) or
(not ListView.HandleAllocated) then
Exit;
for I := 0 to ListView.Items.Count - 1 do
begin
Item := ListView.Items;
if SameText(Item.Caption, sURL) or SameText(Item.Caption, URL) then
begin
while Item.SubItems.Count < 6 do
Item.SubItems.Add('');
Item.SubItems := sState;
SafeScrollListViewToBottom(ListView);
Exit;
end;
end;
Item := ListView.Items.Add;
Item.Caption := sURL;
while Item.SubItems.Count < 6 do
Item.SubItems.Add('');
Item.SubItems := sState;
SafeScrollListViewToBottom(ListView);
end
else
begin
TThread.Queue(nil,
procedure
var
I2: Integer;
It: TListItem;
begin
if not UIUpdatesAllowed then
Exit;
if (ListView = nil) or (csDestroying in ListView.ComponentState) or
(not ListView.HandleAllocated) then
Exit;
for I2 := 0 to ListView.Items.Count - 1 do
begin
It := ListView.Items;
if SameText(It.Caption, sURL) or SameText(It.Caption, URL) then
begin
while It.SubItems.Count < 6 do
It.SubItems.Add('');
It.SubItems := sState;
SafeScrollListViewToBottom(ListView);
Exit;
end;
end;
It := ListView.Items.Add;
It.Caption := sURL;
while It.SubItems.Count < 6 do
It.SubItems.Add('');
It.SubItems := sState;
SafeScrollListViewToBottom(ListView);
end);
end;
end;
procedure TScrapix.SafeUpdateListViewInfo(ListView: TscListView;
const URL: string; RespMs: Integer; const SizeBytes: string; Depth: Integer);
var
sURL, sRespMs, sSizeLocal, sDepth: string;
DepthLocal: Integer;
I: Integer;
Item: TListItem;
begin
if not UIUpdatesAllowed then
Exit;
if ListView = nil then
Exit;
sURL := NormalizeURL(URL, URL);
if sURL = '' then
sURL := URL;
if RespMs < 0 then
sRespMs := ''
else
sRespMs := IntToStr(RespMs);
sSizeLocal := SizeBytes;
DepthLocal := Depth;
if DepthLocal > 0 then
sDepth := IntToStr(DepthLocal)
else
sDepth := '';
if TThread.Current.ThreadID = MainThreadID then
begin
if (csDestroying in ListView.ComponentState) or
(not ListView.HandleAllocated) then
Exit;
for I := 0 to ListView.Items.Count - 1 do
begin
Item := ListView.Items;
if SameText(Item.Caption, sURL) or SameText(Item.Caption, URL) then
begin
while Item.SubItems.Count < 6 do
Item.SubItems.Add('');
Item.SubItems := sRespMs;
Item.SubItems := sSizeLocal;
if sDepth <> '' then
Item.SubItems := sDepth;
SafeScrollListViewToBottom(ListView);
Exit;
end;
end;
Item := ListView.Items.Add;
Item.Caption := sURL;
while Item.SubItems.Count < 6 do
Item.SubItems.Add('');
Item.SubItems := sRespMs;
Item.SubItems := sSizeLocal;
if sDepth <> '' then
Item.SubItems := sDepth;
SafeScrollListViewToBottom(ListView);
end
else
begin
TThread.Queue(nil,
procedure
begin
if not UIUpdatesAllowed then
Exit;
SafeUpdateListViewInfo(ListView, URL, RespMs, SizeBytes, Depth);
end);
end;
end;
procedure TScrapix.SafeLog(Logging: TscListBox; const Msg: string);
const
HORIZONTAL_MARGIN = 16;
var
NewWidth, CurrExtent, ClientW: Integer;
CanvasHandle: TCanvas;
S: string;
begin
S := Msg;
if UIUpdatesAllowed and (Logging <> nil) then
begin
if TThread.Current.ThreadID = MainThreadID then
begin
if (not(csDestroying in Logging.ComponentState)) and Logging.HandleAllocated
then
begin
try
Logging.Items.Add(S);
Logging.ItemIndex := Logging.Items.Count - 1;
except
end;
try
CanvasHandle := Logging.Canvas;
NewWidth := CanvasHandle.TextWidth(S) + HORIZONTAL_MARGIN;
except
NewWidth := 0;
end;
if NewWidth > 0 then
begin
CurrExtent := SendMessage(Logging.Handle,
LB_GETHORIZONTALEXTENT, 0, 0);
if NewWidth > CurrExtent then
SendMessage(Logging.Handle, LB_SETHORIZONTALEXTENT, NewWidth, 0)
else
begin
ClientW := Logging.ClientWidth;
if CurrExtent < ClientW then
SendMessage(Logging.Handle, LB_SETHORIZONTALEXTENT, ClientW, 0);
end;
end;
end;
end
else
begin
TThread.Queue(nil,
procedure
var
W: Integer;
Ce: Integer;
Cw: Integer;
begin
if not UIUpdatesAllowed then
Exit;
if (Logging = nil) or (csDestroying in Logging.ComponentState) or
(not Logging.HandleAllocated) then
Exit;
try
Logging.Items.Add(S);
Logging.ItemIndex := Logging.Items.Count - 1;
except
end;
try
W := Logging.Canvas.TextWidth(S) + HORIZONTAL_MARGIN;
except
W := 0;
end;
if W > 0 then
begin
Ce := SendMessage(Logging.Handle, LB_GETHORIZONTALEXTENT, 0, 0);
if W > Ce then
SendMessage(Logging.Handle, LB_SETHORIZONTALEXTENT, W, 0)
else
begin
Cw := Logging.ClientWidth;
if Ce < Cw then
SendMessage(Logging.Handle, LB_SETHORIZONTALEXTENT, Cw, 0);
end;
end;
end);
end;
end;
if (FLogFilePath <> '') and Assigned(FLogLock) then
begin
try
FLogLock.Acquire;
try
try
TFile.AppendAllText(FLogFilePath, S + sLineBreak, TEncoding.UTF8);
except
end;
finally
FLogLock.Release;
end;
except
end;
end;
end;
function TScrapix.NormalizeURL(const BaseURL, RelOrAbsURL: string): string;
var
S: string;
BaseUri: TURI;
SchemeHost, BasePath, BaseTrimmed: string;
Idx: Integer;
begin
Result := '';
S := Trim(RelOrAbsURL);
if S = '' then
Exit;
Idx := Pos('#', S);
if Idx > 0 then
Delete(S, Idx, MaxInt);
if S.StartsWith('mailto:', True) or S.StartsWith('javascript:', True) or
S.StartsWith('tel:', True) or S.StartsWith('data:', True) then
Exit;
if S.StartsWith('//') then
begin
Result := 'https:' + S;
Exit;
end;
if S.StartsWith('http://', True) or S.StartsWith('https://', True) then
begin
try
Result := TURI.Create(S).ToString
except
Result := S
end;
Exit;
end;
if BaseURL = '' then
Exit;
try
BaseUri := TURI.Create(BaseURL)
except
Exit
end;
SchemeHost := BaseUri.Scheme + '://' + BaseUri.Host;
if BaseUri.Port > 0 then
SchemeHost := SchemeHost + ':' + IntToStr(BaseUri.Port);
if S.StartsWith('/') then
begin
Result := SchemeHost + S;
Exit;
end;
BasePath := BaseUri.Path;
if (BasePath = '') or BasePath.EndsWith('/') then
BaseTrimmed := BasePath
else
BaseTrimmed := ExtractFilePath(BasePath);
if (BaseTrimmed = '') or (BaseTrimmed <> '/') then
BaseTrimmed := '/' + BaseTrimmed;
Result := SchemeHost + BaseTrimmed;
if not Result.EndsWith('/') then
Result := Result + '/';
Result := Result + S;
try
Result := TURI.Create(Result).ToString
except
end;
end;
function GetResponseHeaderValue(const Resp: IHTTPResponse;
const HeaderName: string): string;
var
I: Integer;
begin
Result := '';
if Resp = nil then
Exit;
for I := 0 to Length(Resp.Headers) - 1 do
if SameText(Resp.Headers.Name, HeaderName) then
Exit(Resp.Headers.Value);
end;
function FormatBytes(const SizeBytes: string): string;
var
Bytes: Int64;
d: Double;
DigitsOnly: string;
begin
Result := '';
if Trim(SizeBytes) = '' then
Exit;
if Trim(SizeBytes).ToLower = 'n/a' then
begin
Result := 'n/a';
Exit;
end;
try
Bytes := StrToInt64(Trim(SizeBytes))
except
DigitsOnly := TRegEx.Replace(SizeBytes, '[^0-9]', '');
if DigitsOnly = '' then
Exit;
try
Bytes := StrToInt64(DigitsOnly)
except
Exit
end;
end;
if Bytes = 0 then
begin
Result := 'n/a';
Exit;
end;
if Bytes < 1024 then
Result := Format('%d Octets', )
else if Bytes < Int64(1024) * 1024 then
begin
d := Bytes / 1024;
Result := FormatFloat('0.##', d) + ' Ko';
end
else if Bytes < Int64(1024) * 1024 * 1024 then
begin
d := Bytes / (1024 * 1024);
Result := FormatFloat('0.##', d) + ' Mo';
end
else
begin
d := Bytes / (1024 * 1024 * 1024);
Result := FormatFloat('0.##', d) + ' Go';
end;
end;
function TScrapix.ParseRobots(const RobotsText: string;
OutList: TStringList): Boolean;
var
Lines: TArray
I: Integer;
CurrentAgents: TStringList;
L: string;
AgentMatched: Boolean;
PathPart: string;
begin
Result := False;
OutList.Clear;
if RobotsText = '' then
Exit;
Lines := RobotsText.Split([#13#10, #10, #13],
TStringSplitOptions.ExcludeEmpty);
CurrentAgents := TStringList.Create;
try
CurrentAgents.Clear;
AgentMatched := False;
for I := 0 to Length(Lines) - 1 do
begin
L := Trim(Lines);
if L = '' then
begin
CurrentAgents.Clear;
AgentMatched := False;
Continue;
end;
if StartsText('User-agent:', L) then
begin
CurrentAgents.Clear;
CurrentAgents.Add(Trim(Copy(L, Length('User-agent:') + 1, MaxInt)
).ToLower);
AgentMatched := SameText(CurrentAgents, 'Scrapix') or
SameText(CurrentAgents, '*');
Continue;
end;
if AgentMatched and StartsText('Disallow:', L) then
begin
PathPart := Trim(Copy(L, Length('Disallow:') + 1, MaxInt));
if PathPart = '' then
Continue;
if not PathPart.StartsWith('/') then
PathPart := '/' + PathPart;
if OutList.IndexOf(PathPart) = -1 then
OutList.Add(PathPart);
end;
end;
finally
CurrentAgents.Free;
end;
Result := True;
end;
function TScrapix.EnsureRobotsForHost(const Host, Scheme: string): Boolean;
var
Key: string;
RobotsURL, RobotsText: string;
Client: THTTPClient;
RespStream: TStringStream;
Resp: IHTTPResponse;
SL, NewSL: TStringList;
begin
if Host = '' then
Exit(False);
Key := LowerCase(Host);
if RobotsRules = nil then
RobotsRules := TDictionary
if RobotsRules.ContainsKey(Key) then
Exit(True);
RobotsURL := Scheme + '://' + Host + '/robots.txt';
Client := THTTPClient.Create;
try
Client.ConnectionTimeout := RequestTimeoutMs;
Client.ResponseTimeout := RequestTimeoutMs;
Client.UserAgent := UserAgent;
RespStream := TStringStream.Create('', TEncoding.UTF8);
try
try
Resp := Client.Get(RobotsURL, RespStream);
RobotsText := RespStream.DataString
except
RobotsText := '';
end;
finally
RespStream.Free;
end;
finally
Client.Free;
end;
SL := TStringList.Create;
try
ParseRobots(RobotsText, SL);
NewSL := TStringList.Create;
NewSL.Assign(SL);
RobotsRules.Add(Key, NewSL);
finally
SL.Free;
end;
Result := True;
end;
function TScrapix.IsAllowedByRobots(const URL: string): Boolean;
var
Host, Scheme, Path: string;
Rules: TStringList;
Key: string;
U: TURI;
I: Integer;
DisallowPath: string;
begin
if not FRespectRobots then
Exit(True);
Result := True;
if URL = '' then
Exit(True);
try
U := TURI.Create(URL);
Host := U.Host;
Scheme := U.Scheme;
Path := U.Path;
if Path = '' then
Path := '/';
except
Exit(True);
end;
Key := LowerCase(Host);
if (RobotsRules = nil) or (not RobotsRules.ContainsKey(Key)) then
EnsureRobotsForHost(Host, Scheme);
if (RobotsRules <> nil) and RobotsRules.ContainsKey(Key) then
begin
Rules := RobotsRules;
for I := 0 to Rules.Count - 1 do
begin
DisallowPath := Rules;
if Path.StartsWith(DisallowPath, True) then
begin
Result := False;
Exit;
end;
end;
end;
end;
constructor TScrapix.Create;
begin
inherited Create;
VisitedLinks := nil;
FFoundFiles := nil;
RequestTimeoutMs := 30000;
RequestDelayMs := 0;
SameDomainOnly := True;
FState := STATE_RUNNING;
FPauseEvent := TEvent.Create(nil, True, True, '');
FStoppedEvent := TEvent.Create(nil, True, True, '');
TotalLinks := 0;
FileCount := 0;
BrokenCount := 0;
FRobotsBlocked := 0;
FLinksTraversed := 0;
FAutoDownload := False;
DownloadFolder := '';
RobotsRules := nil;
FRespectRobots := True;
FSearchImages := True;
FSearchDocuments := True;
FSearchAudio := True;
FSearchVideo := True;
FSearchWeb := True;
FFoundFiles := TDictionary
FBrokenLinks := TDictionary
FRunning := False;
FMaxDepth := 0;
FFoundFilesLimit := 2000;
FExploreLimit := 100;
FVisitedFilePath := '';
FBrokenFilePath := '';
FFoundFilePath := '';
FLogFilePath := '';
FLogLock := TCriticalSection.Create;
FDisableUIUpdates := False;
end;
destructor TScrapix.Destroy;
var
SL: TStringList;
begin
CancelExploration;
WaitForStop;
FreeAndNil(VisitedLinks);
FreeAndNil(FFoundFiles);
FreeAndNil(FBrokenLinks);
FreeAndNil(FPauseEvent);
FreeAndNil(FStoppedEvent);
FreeAndNil(FLogLock);
if Assigned(RobotsRules) then
begin
for SL in RobotsRules.Values do
SL.Free;
RobotsRules.Free;
end;
inherited;
end;
procedure TScrapix.PauseExploration;
begin
TInterlocked.Exchange(FState, STATE_PAUSED);
if Assigned(FPauseEvent) then
FPauseEvent.ResetEvent;
end;
procedure TScrapix.ResumeExploration;
begin
TInterlocked.Exchange(FState, STATE_RUNNING);
if Assigned(FPauseEvent) then
FPauseEvent.SetEvent;
end;
procedure TScrapix.CancelExploration;
begin
TInterlocked.Exchange(FState, STATE_CANCEL);
if Assigned(FPauseEvent) then
FPauseEvent.SetEvent;
end;
function TScrapix.IsCanceled: Boolean;
begin
Result := TInterlocked.CompareExchange(FState, 0, 0) = STATE_CANCEL;
end;
function TScrapix.IsPaused: Boolean;
begin
Result := TInterlocked.CompareExchange(FState, 0, 0) = STATE_PAUSED;
end;
function TScrapix.IsRunning: Boolean;
begin
Result := TInterlocked.CompareExchange(PInteger(@FRunning)^, 0, 0) <> 0;
end;
function TScrapix.WaitForStop: Boolean;
begin
CancelExploration;
if Assigned(FStoppedEvent) then
begin
Result := FStoppedEvent.WaitFor(INFINITE) = wrSignaled;
Exit;
end;
while FRunning do
Sleep(20);
Result := not FRunning;
end;
procedure TScrapix.ConfigureCrawl(const ARequestTimeoutMs, ARequestDelayMs
: Integer; ASameDomainOnly: Boolean; AAutoDownload: Boolean;
ARespectRobots: Boolean; AFoundFilesLimit: Integer; AExploreLimit: Integer);
begin
if ARequestTimeoutMs <= 0 then
RequestTimeoutMs := 30000
else
RequestTimeoutMs := ARequestTimeoutMs;
if ARequestDelayMs < 0 then
RequestDelayMs := 0
else
RequestDelayMs := ARequestDelayMs;
SameDomainOnly := ASameDomainOnly;
FAutoDownload := AAutoDownload;
FRespectRobots := ARespectRobots;
if AFoundFilesLimit < 1 then
FFoundFilesLimit := 1
else if AFoundFilesLimit > 2000 then
FFoundFilesLimit := 2000
else
FFoundFilesLimit := AFoundFilesLimit;
if AExploreLimit < 1 then
FExploreLimit := 1
else if AExploreLimit > 100 then
FExploreLimit := 100
else
FExploreLimit := AExploreLimit;
end;
function TScrapix.GetWebContent(const URL: string; ListView: TscListView;
Depth: Integer; Logging: TscListBox): string;
var
Client: THTTPClient;
Mem: TMemoryStream;
Resp: IHTTPResponse;
ContentType, ContentLength: string;
StatusCode: Integer;
NormURL: string;
StartTick, EndTick, ElapsedMs: Cardinal;
S: RawByteString;
I: Integer;
begin
Result := '';
if URL = '' then
Exit;
NormURL := NormalizeURL(URL, URL);
if NormURL = '' then
Exit;
Client := THTTPClient.Create;
try
Client.ConnectionTimeout := RequestTimeoutMs;
Client.ResponseTimeout := RequestTimeoutMs;
Client.UserAgent := UserAgent;
Mem := TMemoryStream.Create;
try
try
StartTick := GetTickCount;
Resp := Client.Get(NormURL, Mem);
EndTick := GetTickCount;
ElapsedMs := EndTick - StartTick;
StatusCode := -1;
ContentType := '';
ContentLength := '';
if Resp <> nil then
begin
StatusCode := Resp.StatusCode;
ContentType := GetResponseHeaderValue(Resp, 'Content-Type');
ContentLength := GetResponseHeaderValue(Resp, 'Content-Length');
if ContentLength = '' then
ContentLength := GetResponseHeaderValue(Resp, 'Content-Range');
if (ContentLength <> '') and ContentLength.StartsWith('bytes', True)
then
begin
I := LastDelimiter('/', ContentLength);
if I > 0 then
ContentLength := Copy(ContentLength, I + 1, MaxInt);
end;
end;
if ContentLength = '' then
ContentLength := IntToStr(Mem.Size);
if Mem.Size > 0 then
begin
SetLength(S, Mem.Size);
Mem.Position := 0;
Mem.ReadBuffer(S, Mem.Size);
Result := string(S);
end
else
Result := '';
if Assigned(ListView) then
begin
SafeUpdateListViewStatus(ListView, NormURL,
Format('%d %s', [StatusCode, ContentType]), 'GET');
SafeUpdateListViewInfo(ListView, NormURL, ElapsedMs,
FormatBytes(ContentLength), Depth);
end;
if Assigned(Logging) then
SafeLog(Logging, Format('GET %s -> %d %s (%s)', [NormURL, StatusCode,
ContentType, FormatBytes(ContentLength)]));
except
on E: Exception do
begin
if Assigned(ListView) then
begin
SafeUpdateListViewStatus(ListView, NormURL,
'Exception : ' + E.Message, 'GET');
SafeUpdateListViewInfo(ListView, NormURL, -1, '', Depth);
end;
if Assigned(Logging) then
SafeLog(Logging, Format('GET Exception %s : %s',
[NormURL, E.Message]));
Result := '';
end;
end;
finally
Mem.Free;
end;
if RequestDelayMs > 0 then
TThread.Sleep(RequestDelayMs);
finally
Client.Free;
end;
end;
procedure TScrapix.ExtractLinks(const HTML: string; BaseURL: string;
var LinkList: TStringList);
var
Regex: TRegEx;
Match: TMatch;
RawLink, Link: string;
begin
Regex := TRegEx.Create(']*?\s+)?href="([^"]*)"', );
Match := Regex.Match(HTML);
while Match.Success do
begin
RawLink := Match.Groups.Value.Trim;
Match := Match.NextMatch;
if RawLink = '' then
Continue;
Link := NormalizeURL(BaseURL, RawLink);
if Link = '' then
Continue;
if (VisitedLinks = nil) or not VisitedLinks.ContainsKey(Link) then
if LinkList.IndexOf(Link) = -1 then
LinkList.Add(Link);
end;
end;
function RemoveURLParams(const URL: string): string;
var
P: Integer;
begin
Result := URL;
P := Pos('?', Result);
if P > 0 then
Result := Copy(Result, 1, P - 1);
end;
procedure TScrapix.ExtractMediaSources(const HTML: string; BaseURL: string;
var ImageList, DocList, AudioList, VideoList, WebList: TStringList);
var
RegexImg, RegexDoc, RegexAudio, RegexVideo: TRegEx;
RegexCss, RegexJs, RegexFont, RegexHtmlLink: TRegEx;
Match: TMatch;
RawSource, Source: string;
procedure AddIfNew(List: TStringList; const S: string);
begin
if (S <> '') and (TPath.GetExtension(S) <> '') and (List.IndexOf(S) = -1)
then
List.Add(S);
end;
begin
if FSearchImages then
begin
RegexImg := TRegEx.Create
(']*src="([^"]+\.(jpg|jpeg|png|gif|bmp|webp|svg))"',
);
Match := RegexImg.Match(HTML);
while Match.Success do
begin
RawSource := Match.Groups.Value;
Source := RemoveURLParams(RawSource);
Source := NormalizeURL(BaseURL, Source);
AddIfNew(ImageList, Source);
Match := Match.NextMatch;
end;
end;
if FSearchDocuments then
begin
RegexDoc := TRegEx.Create
(']*href="([^"]+\.(pdf|zip|rtf|doc|docx|xls|xlsx|ppt|pptx))"',
);
Match := RegexDoc.Match(HTML);
while Match.Success do
begin
RawSource := Match.Groups.Value;
Source := RemoveURLParams(RawSource);
Source := NormalizeURL(BaseURL, Source);
AddIfNew(DocList, Source);
Match := Match.NextMatch;
end;
end;
if FSearchAudio then
begin
RegexAudio := TRegEx.Create
('(?:
Scrapix
Un « aspirateur » web (web crawler) simple orienté VCL qui gère : la récupération HTTP, l'extraction de liens et de ressources
Scrapix
Un « aspirateur » web (web crawler) simple orienté VCL qui gère : la récupération HTTP, l'extraction de liens et de ressources
Le , par XeGregory