January 29, 2023 04:43 pm GMT

Build an SEO Site Analyzer Using Python

Background

When I analyze a website to produce data reports about a target site, I need a real-time scraper that can extract, count, and rank keywords from its pages. To do this, I built a set of real-time analysis tools using Python and the Flask micro-framework.

Project Setup

First, install the Poetry package manager to manage the project's Python dependencies, then set up the project.

Create Virtual Environment

Create and activate a virtual environment:

python -m venv venv
venv\Scripts\activate       # on Windows
source venv/bin/activate    # on macOS or Linux

Install Required Library

pip install poetry

Required Library

Install the required libraries listed below; a Poetry sketch for adding them follows the list.

  • requests
  • pyseoanalyzer
  • beautifulsoup4
  • flask
  • celery
  • backoff
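
Assuming you use Poetry as suggested earlier, one way to add these is the sketch below (the PyPI package names are my assumption; redis and python-dotenv are extras implied by the Redis broker URLs and the dotenv import used later in the config module):

poetry init -n
poetry add requests pyseoanalyzer beautifulsoup4 flask celery backoff
poetry add redis python-dotenv   # assumed extras: Redis broker for Celery, .env loading in config.py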

Create the Core

First, create the application factory that forms the core of the project:

# src/core/factory.py
import os

from flask import Flask

from core import init_celery
from apps.home.views import home_bp


def create_app(**kwargs) -> Flask:
    app: Flask = Flask(__name__)

    # environment
    env = os.environ.get("MODE")
    if env == "development":
        print("Run In Development Mode")
        app.config.from_object("core.config.DevConfig")
    else:
        print("Run In Production Mode")
        app.config.from_object("core.config.ProdConfig")

    # attach Celery if a Celery object was passed in
    if kwargs.get("celery"):
        init_celery(app=app, celery=kwargs.get("celery"))

    # register blueprints
    app.register_blueprint(home_bp)
    return app
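
The post does not show an entry point, so here is a minimal sketch of one, assuming a hypothetical src/app.py; it passes the celery_ext object (defined in the next step) to create_app through the celery keyword the factory already checks for:

# src/app.py (hypothetical entry point, not part of the original post)
from core import celery_ext          # configured Celery object, see core/__init__.py below
from core.factory import create_app

# hand the Celery object to the factory so init_celery() gets called
app = create_app(celery=celery_ext)

if __name__ == "__main__":
    # debug behaviour comes from the MODE environment variable via the config classes
    app.run(host="0.0.0.0", port=5000)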

Set Up the Celery Config

The next step is to set up the Celery configuration:

# src/core/__init__.py
import os
from typing import Any

from celery import Celery
from flask import Flask


def celery_app(app_name=__name__) -> Celery:
    """Create the Celery object used as the base for Celery tasks.

    Args:
        app_name (str): name of the Celery app

    Returns:
        Celery: configured Celery object
    """
    if os.environ.get("MODE") == "development":
        print("Broker development")
        redis_broker = os.environ.get("CELERY_BROKER_URL_DEV")
        redis_backend = os.environ.get("CELERY_RESULT_BACKEND_DEV")
        return Celery(app_name, backend=redis_backend, broker=redis_broker)
    else:
        print("Broker Production")
        redis_broker = os.environ.get("CELERY_BROKER_URL")
        redis_backend = os.environ.get("CELERY_RESULT_BACKEND")
        return Celery(app_name, backend=redis_backend, broker=redis_broker)


def init_celery(celery: Celery, app: Flask):
    """Add Flask application context support to Celery.

    Args:
        celery (Celery): Celery object
        app (Flask): Flask object

    Returns:
        Celery: Celery object bound to the Flask app context
    """
    app.config.update({
        "broker_url": os.environ.get("CELERY_BROKER_URL"),
        "result_backend": os.environ.get("CELERY_RESULT_BACKEND"),
    })
    celery.conf.update(app.config)

    TaskBase = celery.Task

    class ContextTask(TaskBase):
        abstract = True

        def __call__(self, *args: Any, **kwargs: Any) -> Any:
            with app.app_context():
                return TaskBase.__call__(self, *args, **kwargs)

    celery.Task = ContextTask
    return celery


celery_ext = celery_app()
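
Celery is configured here, but the original snippets never define a task. A hedged sketch of what one could look like, assuming a hypothetical src/apps/home/tasks.py that wraps the analyze function built later in this post:

# src/apps/home/tasks.py (hypothetical; not shown in the original post)
from typing import Any

from core import celery_ext
from modules.analyzer import analyze


@celery_ext.task(name="home.analyze_site")
def analyze_site(url: str, follow_links: bool = False) -> dict[str, Any]:
    """Run the blocking crawl in a Celery worker instead of the request thread."""
    return analyze(url=url, follow_links=follow_links)

A worker could then be started from the src directory with something like celery -A core:celery_ext worker --loglevel=info, assuming the worker process imports this module and the Redis broker is reachable.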

After creating the Celery extension, let's build the configuration classes for the project:

# src/core/config.py
import os
from pathlib import Path

from dotenv import load_dotenv

# load .env from the project root so os.environ sees its values
dotenv_path = os.path.join(Path(__file__).resolve().parent.parent.parent, ".env")
load_dotenv(dotenv_path)


class BaseConfig:
    SECRET_KEY: str = os.environ.get("SECRET_KEY")


class DevConfig(BaseConfig):
    DEBUG: bool = True
    FLASK_DEBUG: bool = True


class ProdConfig(BaseConfig):
    DEBUG: bool = False
    broker_url: str = os.environ.get("CELERY_BROKER_URL")
    RESULT_BACKEND: str = os.environ.get("CELERY_RESULT_BACKEND")
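
The config and Celery modules read their settings from the environment; a hypothetical .env at the project root matching those variable names might look like this (all values are placeholders):

# .env (example values only, adjust to your environment)
MODE=development
SECRET_KEY=change-me
CELERY_BROKER_URL=redis://localhost:6379/0
CELERY_RESULT_BACKEND=redis://localhost:6379/1
CELERY_BROKER_URL_DEV=redis://localhost:6379/0
CELERY_RESULT_BACKEND_DEV=redis://localhost:6379/1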

Build Scraper Module

In this case we extend the pyseoanalyzer scraper module so that it returns custom results for our API.

Building Crawler Instance

# src/modules/crawler.py
from seoanalyzer.website import Website
from xml.dom import minidom

from modules.scrape import PageCrawler
from modules.helper import http


class WebCrawler(Website):
    def __init__(self, base_url, sitemap, analyze_headings, analyze_extra_tags, follow_links, scrape_img):
        super().__init__(base_url, sitemap, analyze_headings, analyze_extra_tags, follow_links)
        self.scrape_img = scrape_img

    def crawl(self):
        if self.sitemap:
            page = http.get(self.sitemap)
            if self.sitemap.endswith('xml'):
                xmldoc = minidom.parseString(page.data.decode('utf-8'))
                sitemap_urls = xmldoc.getElementsByTagName('loc')
                for url in sitemap_urls:
                    self.page_queue.append(self.get_text_from_xml(url.childNodes))
            elif self.sitemap.endswith('txt'):
                sitemap_urls = page.data.decode('utf-8').split('\n')
                for url in sitemap_urls:
                    self.page_queue.append(url)

        self.page_queue.append(self.base_url)

        for url in self.page_queue:
            if url in self.crawled_urls:
                print("Crawled URL: {}".format(url))
                continue
            else:
                print("process URL to crawl: {}".format(url))

            page = PageCrawler(
                url=url,
                base_url=self.base_url,
                analyze_headings=self.analyze_headings,
                analyze_extra_tags=self.analyze_extra_tags,
                scrape_img=self.scrape_img,
            )

            if page.parsed_url.netloc != page.base_domain.netloc:
                continue

            page.analyze()

            # aggregate per-page metrics into the site-wide counters
            self.content_hashes[page.content_hash].add(page.url)
            for w in page.wordcount:
                self.wordcount[w] += page.wordcount[w]
            for b in page.bigrams:
                self.bigrams[b] += page.bigrams[b]
            for t in page.trigrams:
                self.trigrams[t] += page.trigrams[t]

            self.page_queue.extend(page.links)
            self.crawled_pages.append(page)
            self.crawled_urls.add(page.url)

            if not self.follow_links:
                break
            if not self.scrape_img:
                break

Next, build a page crawler to analyze the site page by page:

# src/modules/scrape.py
# file to process a single page
import re
import hashlib
import requests

from seoanalyzer.page import Page
from bs4 import BeautifulSoup
from urllib3.exceptions import HTTPError, MaxRetryError
from urllib.parse import urlsplit
from typing import Any

from modules.helper import http


class PageCrawler(Page):
    def __init__(
        self,
        url: str,
        base_url: str,
        analyze_headings: bool = False,
        analyze_extra_tags: bool = False,
        scrape_img: bool = False,
    ) -> None:
        super().__init__(url, base_url, analyze_headings, analyze_extra_tags)
        self.scrape_img: bool = scrape_img
        self.analyze_headings: bool = analyze_headings
        self.analyze_extra_tags: bool = analyze_extra_tags
        # adding broken, internal and external link tracking
        self.internal_links: list = []
        self.broken_link: list = []
        self.external_link: list = []
        if self.scrape_img:
            self.images = []
        if self.analyze_headings:
            self.headings = {}
        if analyze_extra_tags:
            self.additional_info = {}

    def scrape_external_link(self, bs: BeautifulSoup) -> list:
        external_link = bs.find_all("a")
        hostname = urlsplit(self.url).hostname
        for link in external_link:
            valid_link = link.get("href")
            # skip anchors without an href
            if valid_link and (valid_link.startswith("http://") or valid_link.startswith("https://")):
                if urlsplit(valid_link).hostname != hostname:
                    self.external_link.append(valid_link)
        return self.external_link

    def scrape_broken_link(self, bs: BeautifulSoup) -> list:
        broken_link = bs.find_all("a")
        for broke in broken_link:
            valid_url = broke.get("href")
            if valid_url and (valid_url.startswith("http://") or valid_url.startswith("https://")):
                try:
                    broken = http.get(valid_url)
                except MaxRetryError:
                    broken = http.get_without_redirect(valid_url)
                try:
                    # urllib3 responses expose .status ...
                    if broken.status == 404:
                        self.broken_link.append(valid_url)
                except AttributeError:
                    # ... while requests responses expose .status_code
                    if broken.status_code == 404:
                        self.broken_link.append(valid_url)
            else:
                continue
        return self.broken_link

    def scrape_internal_link(self, bs: BeautifulSoup) -> list:
        internal_link = bs.find_all("a")
        hostname = urlsplit(self.url).hostname
        for link in internal_link:
            valid_link = link.get("href")
            if valid_link and (valid_link.startswith("http://") or valid_link.startswith("https://")):
                if urlsplit(valid_link).hostname == hostname:
                    self.internal_links.append(valid_link)
        return self.internal_links

    def scrape_image(self, bs: BeautifulSoup) -> list:
        """Scrape image data (the alt text of each img tag).

        Args:
            bs (BeautifulSoup): BeautifulSoup object

        Returns:
            list: list of image alt texts
        """
        images = bs.find_all("img")
        for img in images:
            source = img.get("alt")
            self.images.append(source)
        return self.images

    def talk(self) -> dict[str, Any]:
        """Return a dictionary of page results that can be printed.

        Returns:
            dict: dictionary of context data that can be printed
        """
        context: dict = {
            "url": self.url,
            "title": self.title,
            "description": self.description,
            "word_count": self.total_word_count,
            "keywords": self.sort_freq_dist(self.keywords, limit=5),
            "bigrams": self.bigrams,
            "trigrams": self.trigrams,
            "broken links": self.broken_link,
            "internal links": self.internal_links,
            "external links": self.external_link,
            "warnings": self.warnings,
            "content_hash": self.content_hash,
        }
        # append the optional items
        if self.analyze_headings:
            context["headings"] = self.headings
        if self.analyze_extra_tags:
            context["additional_info"] = self.additional_info
        if self.scrape_img:
            context["images"] = self.images
        return context

    def analyze(self, raw_html=None):
        """
        Analyze the page and populate the warnings list
        """
        if not raw_html:
            valid_prefixes = []
            # only allow http:// https:// and //
            for s in [
                "http://",
                "https://",
                "//",
            ]:
                valid_prefixes.append(self.url.startswith(s))
            if True not in valid_prefixes:
                self.warn(f"{self.url} does not appear to have a valid protocol.")
                return
            if self.url.startswith("//"):
                self.url = f"{self.base_domain.scheme}:{self.url}"
            if self.parsed_url.netloc != self.base_domain.netloc:
                self.warn(f"{self.url} is not part of {self.base_domain.netloc}.")
                return
            try:
                page = http.get(self.url)
            except HTTPError as e:
                self.warn(f"Returned {e}")
                return
            encoding = "ascii"
            if "content-type" in page.headers:
                encoding = page.headers["content-type"].split("charset=")[-1]
            if encoding.lower() not in ("text/html", "text/plain", "utf-8"):
                self.warn(f"Can not read {encoding}")
                return
            else:
                raw_html = page.data.decode("utf-8")

        self.content_hash = hashlib.sha1(raw_html.encode("utf-8")).hexdigest()

        # remove comments, they screw with BeautifulSoup
        clean_html = re.sub(r"<!--.*?-->", r"", raw_html, flags=re.DOTALL)

        soup_lower = BeautifulSoup(clean_html.lower(), "html.parser")
        soup_unmodified = BeautifulSoup(clean_html, "html.parser")

        texts = soup_lower.findAll(text=True)
        visible_text = [w for w in filter(self.visible_tags, texts)]

        self.process_text(visible_text)
        self.populate(soup_lower)

        self.analyze_title()
        self.analyze_description()
        self.analyze_og(soup_lower)
        self.analyze_a_tags(soup_unmodified)
        self.analyze_img_tags(soup_lower)
        self.analyze_h1_tags(soup_lower)

        # add broken/internal/external link wrappers
        self.scrape_broken_link(soup_unmodified)
        self.scrape_internal_link(soup_unmodified)
        self.scrape_external_link(soup_unmodified)

        if self.analyze_headings:
            self.analyze_heading_tags(soup_unmodified)
        if self.analyze_extra_tags:
            self.analyze_additional_tags(soup_unmodified)

        # add the scrape image function
        if self.scrape_img:
            self.scrape_image(soup_unmodified)

        return True

Build Analyzer Instance

To run the scraper module, let's build the analyzer that ties the custom modules together:

# src/modules/analyzer.py
import time
from operator import itemgetter
from typing import Any

from modules.crawler import WebCrawler


def analyze(url, sitemap_url=None, analyze_headings=False, analyze_extra_tags=False,
            follow_links=True, scrape_img=True) -> dict[str, Any]:
    start_time = time.time()

    def calc_total_time():
        return time.time() - start_time

    output = {'pages': [], 'keywords': [], 'errors': [], 'total_time': calc_total_time()}

    site = WebCrawler(base_url=url, sitemap=sitemap_url, analyze_headings=analyze_headings,
                      analyze_extra_tags=analyze_extra_tags, follow_links=follow_links,
                      scrape_img=scrape_img)
    site.crawl()

    for p in site.crawled_pages:
        output['pages'].append(p.talk())

    output['duplicate_pages'] = [
        list(site.content_hashes[p])
        for p in site.content_hashes
        if len(site.content_hashes[p]) > 1
    ]

    sorted_words = sorted(site.wordcount.items(), key=itemgetter(1), reverse=True)
    sorted_bigrams = sorted(site.bigrams.items(), key=itemgetter(1), reverse=True)
    sorted_trigrams = sorted(site.trigrams.items(), key=itemgetter(1), reverse=True)

    output['keywords'] = []

    for w in sorted_words:
        if w[1] > 4:
            output['keywords'].append({
                'word': w[0],
                'count': w[1],
            })

    for w, v in sorted_bigrams:
        if v > 4:
            output['keywords'].append({
                'word': w,
                'count': v,
            })

    for w, v in sorted_trigrams:
        if v > 4:
            output['keywords'].append({
                'word': w,
                'count': v,
            })

    # Sort one last time...
    output['keywords'] = sorted(output['keywords'], key=itemgetter('count'), reverse=True)

    output['total_time'] = calc_total_time()

    return output
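
A quick way to try the analyzer from a Python shell, run from the src directory (the URL is a placeholder):

# illustrative usage of analyze()
from modules.analyzer import analyze

report = analyze("https://example.com", follow_links=False)
print("pages analyzed:", len(report["pages"]))
print("top keywords:", report["keywords"][:5])
print("total time (s):", round(report["total_time"], 2))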

Adding Helper Modules

Add this helper.py to provide HTTP utilities for the scraper module:

# src/modules/helper.py
import certifi
import requests
from urllib3 import PoolManager
from urllib3 import Timeout
from urllib3.response import HTTPResponse
from urllib3.util import Retry
from typing import Any


class Http:
    def __init__(self):
        user_agent: dict[str, str] = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
        }
        self.retry_strategy: Retry = Retry(
            total=10,      # maximum number of retries
            redirect=100,  # maximum number of redirects to follow
            status_forcelist=[
                429,
                500,
                502,
                503,
                504,
            ],  # list of HTTP status codes to retry on
            method_whitelist=[  # note: renamed to allowed_methods in urllib3 >= 1.26
                "HEAD",
                "TRACE",
                "GET",
                "PUT",
                "OPTIONS",
                "DELETE",
            ],  # list of HTTP methods to retry on
        )
        self.http: PoolManager = PoolManager(
            timeout=Timeout(connect=1.0, read=2.0),
            cert_reqs="CERT_REQUIRED",
            ca_certs=certifi.where(),
            headers=user_agent,
            retries=self.retry_strategy,
        )

    def get(self, url) -> Any:
        return self.http.request("GET", url)

    def get_without_redirect(self, url: str) -> HTTPResponse:
        # fall back to requests without certificate verification
        res = requests.get(url, verify=False)
        return res

    def post(self, url: str, data: dict[str, Any]) -> Any:
        return self.http.request("POST", url, fields=data)

    def put(self, url: str, data: dict[str, Any]) -> HTTPResponse:
        return self.http.request("PUT", url, fields=data)

    def delete(self, url: str) -> HTTPResponse:
        return self.http.request("DELETE", url)


http = Http()
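
backoff appears in the dependency list but is not used in the snippets above; a hedged sketch of how it could wrap the helper, if you want exponential backoff around flaky fetches (the decorator arguments and exception choice are illustrative):

# illustrative: retrying a flaky fetch with the backoff library
import backoff
from urllib3.exceptions import HTTPError, MaxRetryError

from modules.helper import http


@backoff.on_exception(backoff.expo, (HTTPError, MaxRetryError), max_tries=5)
def fetch(url: str):
    """Fetch a URL, retrying with exponential backoff on transient errors."""
    return http.get(url)


response = fetch("https://example.com")
print(response.status)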

With that, the scraper module is ready to use, with reusable custom functions. Let's continue to the next step and build the web app using Flask.

Build Web Apps Using Flask

In the previous steps we already set up the Flask application, so in this step we build the routes and views for the web app:

# src/apps/home/views.py
import json

# from seoanalyzer import analyze
from flask import Blueprint, render_template, request, jsonify
from typing import Any

from modules.analyzer import analyze

home_bp: Blueprint = Blueprint(
    "home", __name__, template_folder="templates", static_folder="static"
)


@home_bp.route("/", methods=["GET", "POST"])
def index():
    if request.method == "POST":
        query = request.form.get("website")
        analyzer = analyze(
            url=query,
            follow_links=False,
        )
        results: list[dict[str, Any]] = []
        # process pages here
        for page in analyzer["pages"]:
            data_dict: dict[str, Any] = {
                "word count": page["word_count"],
                "page title": page["title"],
                "meta description": page["description"],
            }
            results.append(data_dict)
            for keywords in page["keywords"]:
                print(keywords)
        # analyzer keys: ['pages', 'keywords', 'errors', 'total_time', 'duplicate_pages']
        return render_template("index.html", datas=results)
    return render_template("index.html")
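
The view imports jsonify and json but only renders HTML. If you also want a machine-readable report, here is a hedged sketch of an extra route (the /api/analyze path and response shape are my assumptions, not part of the original post):

# hypothetical JSON endpoint, added to src/apps/home/views.py
@home_bp.route("/api/analyze", methods=["POST"])
def analyze_api():
    query = request.form.get("website")
    report = analyze(url=query, follow_links=False)
    # return the analyzer output as JSON
    return jsonify(
        {
            "pages": report["pages"],
            "keywords": report["keywords"],
            "total_time": report["total_time"],
        }
    )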

Design The Templates

After designing the views, let's create a Bootstrap template in index.html, placed in the src/apps/home/templates directory:

<!doctype html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>SEO Audit Tools</title>
    <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-rbsA2VBKQhggwzxH7pPCaAqO46MgnOM80zW1RWuH61DGLwZJEdK2Kadq2F9CUG65" crossorigin="anonymous">
  </head>
  <body>
    <div class="container justify-content-center mx-auto">
        <h1 class="text-center">SEO Audit Tools</h1>
        <form action="" method="post">
            <div class="input-group">
                <input name="website" type="search" class="form-control rounded" placeholder="example.com" aria-label="Search" aria-describedby="search-addon" />
                <button type="submit" class="btn btn-warning text-white">Audit</button>
            </div>
        </form>
    </div>
    <div class="container">
      <table class="table">
        <thead>
          <tr>
            <th scope="col">No.</th>
            <th scope="col">Word Count</th>
            <th scope="col">Page Title</th>
            <th scope="col">Meta Description</th>
            <th scope="col">Details Page</th>
          </tr>
        </thead>
        <tbody>
          {% for data in datas %}
          <tr>
            <th scope="row">{{ loop.index }}</th>
            <td>{{ data['word count'] }}</td>
            <td>{{ data['page title'] }}</td>
            <td>{{ data['meta description'] }}</td>
            <td><a class="btn btn-outline-primary btn-sm" href="">See Details</a></td>
          </tr>
          {% endfor %}
        </tbody>
      </table>
    </div>
    <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js" integrity="sha384-kenU1KFdBIe4zVF0s0G1M5b4hcpxyD9F7jL+jjXkk+Q2h455rYXK/7HAuoJl+0I4" crossorigin="anonymous"></script>
  </body>
</html>


Conclusion

This post discussed how to create custom modules on top of a third-party Python module. The project also supports Docker; please check it out on GitHub for more. Good luck!


Original Link: https://dev.to/arifrhm/build-seo-site-analyzer-using-python-3g2n
