AbotX 2.1.12 Ultimate

A powerful C# web crawler that makes advanced crawling features easy to use. AbotX builds upon the open source Abot C# Web Crawler by adding a set of wrappers and extensions.

Crawl multiple sites concurrently
Pause/resume live crawls
Render JavaScript before processing
Simplified pluggability/extensibility
Avoid getting blocked by sites
Automatically tune speed/concurrency

Parallel Crawler Engine
A single crawler instance can crawl one site quickly. However, if you have to crawl 10,000 sites quickly, you need the ParallelCrawlerEngine. It allows you to crawl a configurable number of sites concurrently to maximize throughput.

Pause And Resume
There may be times when you need to temporarily pause a crawl, whether to clear disk space on the machine or to run a resource-intensive utility. Whatever the reason, you can confidently Pause and Resume the crawler and it will continue on as if nothing happened.

Auto Throttling
Most websites you crawl cannot or will not handle the load of a web crawler. Auto Throttling automatically slows the crawl down if the site being crawled shows signs of stress or an unwillingness to respond to the frequency of HTTP requests.

Easy Override
Easy Override lets you plug in your own implementation of any key interface through an easy-to-use object wrapper that handles nested dependencies for you, no matter how deep they go (see the sketch below).
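
Below is a minimal sketch of the Easy Override pattern, not verbatim AbotX API. It mirrors the ParallelImplementationOverride/ParallelImplementationContainer usage in the parallel engine demo further down the page; the single-crawler names used here (ImplementationOverride, ImplementationContainer, the PageRequester property, and the YourCustomPageRequester class) are assumptions for illustration, so verify them against the AbotX version you are using.

//Minimal sketch, assuming ImplementationOverride/ImplementationContainer follow the same
//naming convention as the ParallelImplementationOverride/ParallelImplementationContainer
//shown in the parallel engine demo below
var config = new CrawlConfigurationX
{
    MaxPagesToCrawl = 10,
    MinCrawlDelayPerDomainMilliSeconds = 2000
};

var implementations = new ImplementationContainer
{
    //YourCustomPageRequester is a hypothetical stand-in for your own IPageRequester
    //implementation; the wrapper resolves its nested dependencies for you
    PageRequester = new YourCustomPageRequester(config)
};

using (var crawler = new CrawlerX(config, new ImplementationOverride(config, implementations)))
{
    await crawler.CrawlAsync(new Uri("YourSiteHere"));
}

The parallel engine demo below applies the same pattern, passing a SiteToCrawlProvider and a WebCrawlerFactory through a ParallelImplementationContainer.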

JavaScript Rendering
Many web pages on the internet today use JavaScript to create the final page rendering. Most web crawlers do not render the JavaScript but instead just process the raw HTML sent back by the server. Use this feature to render JavaScript before processing.

Auto Tuning
It's difficult to predict what your machine can handle when the sites you crawl/process all require different levels of machine resources. Auto Tuning automatically monitors the host machine's resource usage and adjusts the crawl speed and concurrency to maximize throughput without overrunning it.

Using AbotX
using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using Abot2;
using AbotX2.Crawler;
using AbotX2.Parallel;
using AbotX2.Poco;
using Serilog;

namespace AbotX2.Demo
{
    class Program
    {
        static async Task Main(string[] args)
        {
            //Use Serilog to log
            Log.Logger = new LoggerConfiguration()
                .MinimumLevel.Information()
                .Enrich.WithThreadId()
                .WriteTo.Console(outputTemplate: Constants.LogFormatTemplate)
                .CreateLogger();

            var siteToCrawl = new Uri("YourSiteHere");

            //Uncomment to demo major features
            //await DemoCrawlerX_PauseResumeStop(siteToCrawl);
            //await DemoCrawlerX_JavascriptRendering(siteToCrawl);
            //await DemoCrawlerX_AutoTuning(siteToCrawl);
            //await DemoCrawlerX_Throttling(siteToCrawl);
            //await DemoParallelCrawlerEngine();
        }

        private static async Task DemoCrawlerX_PauseResumeStop(Uri siteToCrawl)
        {
            using (var crawler = new CrawlerX(GetSafeConfig()))
            {
                crawler.PageCrawlCompleted += (sender, args) =>
                {
                    //Check out args.CrawledPage for any info you need
                };
                var crawlTask = crawler.CrawlAsync(siteToCrawl);

                crawler.Pause();    //Suspend all operations

                Thread.Sleep(7000);

                crawler.Resume();   //Resume as if nothing happened

                crawler.Stop(true); //Stop or abort the crawl

                await crawlTask;
            }
        }

        private static async Task DemoCrawlerX_JavascriptRendering(Uri siteToCrawl)
        {
            var pathToPhantomJSExeFolder = @"[YourNugetPackagesLocationAbsolutePath]\PhantomJS.2.1.1\tools\phantomjs";
            var config = new CrawlConfigurationX
            {
                IsJavascriptRenderingEnabled = true,
                JavascriptRendererPath = pathToPhantomJSExeFolder,
                IsSendingCookiesEnabled = true,
                MaxConcurrentThreads = 1,
                MaxPagesToCrawl = 1,
                JavascriptRenderingWaitTimeInMilliseconds = 3000,
                CrawlTimeoutSeconds = 20
            };

            using (var crawler = new CrawlerX(config))
            {
                crawler.PageCrawlCompleted += (sender, args) =>
                {
                    //JS should be fully rendered here args.CrawledPage.Content.Text
                };

                await crawler.CrawlAsync(siteToCrawl);
            }
        }

        private static async Task DemoCrawlerX_AutoTuning(Uri siteToCrawl)
        {
            var config = GetSafeConfig();
            config.AutoTuning = new AutoTuningConfig
            {
                IsEnabled = true,
                CpuThresholdHigh = 85,
                CpuThresholdMed = 65,
                MinAdjustmentWaitTimeInSecs = 10
            };
            //Optional, configure how aggressively to speed up or slow down during auto tuning
            config.Accelerator = new AcceleratorConfig();
            config.Decelerator = new DeceleratorConfig();

            //Now the crawl is able to "AutoTune" itself if the host machine
            //is showing signs of stress.
            using (var crawler = new CrawlerX(config))
            {
                crawler.PageCrawlCompleted += (sender, args) =>
                {
                    //Check out args.CrawledPage for any info you need
                };
                await crawler.CrawlAsync(siteToCrawl);
            }
        }

        private static async Task DemoCrawlerX_Throttling(Uri siteToCrawl)
        {
            var config = GetSafeConfig();
            config.AutoThrottling = new AutoThrottlingConfig
            {
                IsEnabled = true,
                ThresholdHigh = 2,
                ThresholdMed = 2,
                MinAdjustmentWaitTimeInSecs = 10
            };
            //Optional, configure how aggressively to speed up or slow down during throttling
            config.Accelerator = new AcceleratorConfig();
            config.Decelerator = new DeceleratorConfig();

            //Now the crawl is able to "Throttle" itself if the site being crawled
            //is showing signs of stress.
            using (var crawler = new CrawlerX(config))
            {
                crawler.PageCrawlCompleted += (sender, args) =>
                {
                    //Check out args.CrawledPage for any info you need
                };
                await crawler.CrawlAsync(siteToCrawl);
            }
        }

        private static async Task DemoParallelCrawlerEngine()
        {
            var siteToCrawlProvider = new SiteToCrawlProvider();
            siteToCrawlProvider.AddSitesToCrawl(new List<SiteToCrawl>
            {
                new SiteToCrawl{ Uri = new Uri("YOURSITE1") },
                new SiteToCrawl{ Uri = new Uri("YOURSITE2") },
                new SiteToCrawl{ Uri = new Uri("YOURSITE3") },
                new SiteToCrawl{ Uri = new Uri("YOURSITE4") },
                new SiteToCrawl{ Uri = new Uri("YOURSITE5") }
            });

            var config = GetSafeConfig();
            config.MaxConcurrentSiteCrawls = 3;
                
            var crawlEngine = new ParallelCrawlerEngine(
                config, 
                new ParallelImplementationOverride(config, 
                    new ParallelImplementationContainer()
                    {
                        SiteToCrawlProvider = siteToCrawlProvider,
                        WebCrawlerFactory = new WebCrawlerFactory(config)//Same config will be used for every crawler
                    })
                );                
            
            var crawlCounts = new Dictionary<Guid, int>();
            var siteStartingEvents = 0;
            var allSitesCompletedEvents = 0;
            crawlEngine.CrawlerInstanceCreated += (sender, eventArgs) =>
            {
                //Tag each crawler instance with its own id via its CrawlBag
                var crawlId = Guid.NewGuid();
                eventArgs.Crawler.CrawlBag.CrawlId = crawlId;
            };
            crawlEngine.SiteCrawlStarting += (sender, args) =>
            {
                Interlocked.Increment(ref siteStartingEvents);
            };
            crawlEngine.SiteCrawlCompleted += (sender, eventArgs) =>
            {
                lock (crawlCounts)
                {
                    crawlCounts.Add(eventArgs.CrawledSite.SiteToCrawl.Id, eventArgs.CrawledSite.CrawlResult.CrawlContext.CrawledCount);
                }
            };
            crawlEngine.AllCrawlsCompleted += (sender, eventArgs) =>
            {
                Interlocked.Increment(ref allSitesCompletedEvents);
            };

            await crawlEngine.StartAsync();
        }

        private static CrawlConfigurationX GetSafeConfig()
        {
            /*The following settings will help keep your IP from getting banned
             by the sites you are trying to crawl. The idea is to crawl
             only 10 pages and wait 2 seconds between HTTP requests
             */
            return new CrawlConfigurationX
            {
                MaxPagesToCrawl = 10,
                MinCrawlDelayPerDomainMilliSeconds = 2000
            };
        }
    }
}

