Build an SEO Site Analyzer Using Python
Background
When I analyze a website to produce data reports about a target site, I need a real-time scraper that can analyze pages, extract keywords, and count words from the target. So I built a small real-time analysis tool using Python and the Flask micro-framework.
Project Setup
First, we will install the Poetry package manager to manage the Python packages. Let's start by setting up the project.
Create Virtual Environment
Create a virtual environment and activate it:
```bash
python -m venv venv
venv\Scripts\activate       # on Windows
source venv/bin/activate    # on macOS or Linux
```
Install Poetry
```bash
pip install poetry
```
Required Libraries
Install the required libraries listed below; a sample Poetry command follows the list.
- requests
- pyseoanalyzer
- beautifulsoup4
- flask
- celery
- backoff
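With Poetry these can be added in one go. The command below is only a sketch: `python-dotenv` is included because the config module later loads a `.env` file, and `redis` because the Celery broker/backend URLs point at Redis; pin versions as you prefer.

```bash
poetry add requests pyseoanalyzer beautifulsoup4 flask celery backoff python-dotenv redis
```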
Create the core
First, create the application factory that forms the core of the project:
```python
# src/core/factory.py
import os

from flask import Flask

from core import init_celery
from apps.home.views import home_bp


def create_app(**kwargs) -> Flask:
    app: Flask = Flask(__name__)

    # environment
    env = os.environ.get("MODE")
    if env == "development":
        print("Run In Development Mode")
        app.config.from_object("core.config.DevConfig")
    else:
        print("Run In Production Mode")
        app.config.from_object("core.config.ProdConfig")

    if kwargs.get("celery"):
        init_celery(app=app, celery=kwargs.get("celery"))

    # register blueprints
    app.register_blueprint(home_bp)
    return app
```
Set Up Celery Config
The next step is to set up the Celery configuration:
```python
# src/core/__init__.py
import os
from typing import Any

from celery import Celery
from flask import Flask


def celery_app(app_name=__name__) -> Celery:
    """Create the Celery object used to register Celery tasks.

    Args:
        app_name (str): name of the Celery application

    Returns:
        Celery: a configured Celery object
    """
    if os.environ.get("MODE") == "development":
        print("Broker development")
        redis_broker = os.environ.get("CELERY_BROKER_URL_DEV")
        redis_backend = os.environ.get("CELERY_RESULT_BACKEND_DEV")
        return Celery(app_name, backend=redis_backend, broker=redis_broker)
    else:
        print("Broker Production")
        redis_broker = os.environ.get("CELERY_BROKER_URL")
        redis_backend = os.environ.get("CELERY_RESULT_BACKEND")
        return Celery(app_name, backend=redis_backend, broker=redis_broker)


def init_celery(celery: Celery, app: Flask) -> Celery:
    """Add Flask application-context support to Celery tasks.

    Args:
        celery (Celery): Celery object
        app (Flask): Flask object

    Returns:
        Celery: the configured Celery object
    """
    app.config.update({
        "broker_url": os.environ.get("CELERY_BROKER_URL"),
        "result_backend": os.environ.get("CELERY_RESULT_BACKEND"),
    })
    celery.conf.update(app.config)

    TaskBase = celery.Task

    class ContextTask(TaskBase):
        abstract = True

        def __call__(self, *args: Any, **kwargs: Any) -> Any:
            # run every task inside the Flask application context
            with app.app_context():
                return TaskBase.__call__(self, *args, **kwargs)

    celery.Task = ContextTask
    return celery


celery_ext = celery_app()
```
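The Celery factory above reads the broker and result-backend URLs from environment variables (and the config module below reads `MODE` and `SECRET_KEY`). A sample `.env` with placeholder values, assuming a local Redis instance, could look like this:

```
# .env (placeholder values, assuming Redis running on localhost)
MODE=development
SECRET_KEY=change-me
CELERY_BROKER_URL_DEV=redis://localhost:6379/0
CELERY_RESULT_BACKEND_DEV=redis://localhost:6379/1
CELERY_BROKER_URL=redis://localhost:6379/0
CELERY_RESULT_BACKEND=redis://localhost:6379/1
```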
After creating the Celery extension, let's build the configuration for the project:
```python
# src/core/config.py
import os
from pathlib import Path

from dotenv import load_dotenv

# load variables from the .env file at the project root into the environment
dotenv_path = os.path.join(Path(__file__).resolve().parent.parent.parent, ".env")
load_dotenv(dotenv_path)


class BaseConfig:
    SECRET_KEY: str = os.environ.get("SECRET_KEY")


class DevConfig(BaseConfig):
    DEBUG: bool = True
    FLASK_DEBUG: bool = True


class ProdConfig(BaseConfig):
    DEBUG: bool = False
    broker_url: str = os.environ.get("CELERY_BROKER_URL")
    RESULT_BACKEND = os.environ.get("CELERY_RESULT_BACKEND")
```
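The post does not show the entry point that ties the factory and the Celery extension together, so here is a minimal sketch; the file name `src/app.py` and the wiring are assumptions based on the factory's `celery` keyword argument:

```python
# src/app.py (hypothetical entry point, not from the original post)
from core import celery_ext
from core.factory import create_app

# pass the Celery object so init_celery() wraps tasks in the Flask app context
app = create_app(celery=celery_ext)

if __name__ == "__main__":
    app.run()
```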
Build Scraper Module
In this case, we need to rebuild the scraper module so it produces custom results for our API.
Building Crawler Instance
```python
# src/modules/crawler.py
from xml.dom import minidom

from seoanalyzer.website import Website

from modules.scrape import PageCrawler
from modules.helper import http


class WebCrawler(Website):
    def __init__(self, base_url, sitemap, analyze_headings,
                 analyze_extra_tags, follow_links, scrape_img):
        super().__init__(base_url, sitemap, analyze_headings,
                         analyze_extra_tags, follow_links)
        self.scrape_img = scrape_img

    def crawl(self):
        if self.sitemap:
            page = http.get(self.sitemap)
            if self.sitemap.endswith('xml'):
                xmldoc = minidom.parseString(page.data.decode('utf-8'))
                sitemap_urls = xmldoc.getElementsByTagName('loc')
                for url in sitemap_urls:
                    self.page_queue.append(self.get_text_from_xml(url.childNodes))
            elif self.sitemap.endswith('txt'):
                sitemap_urls = page.data.decode('utf-8').split('\n')
                for url in sitemap_urls:
                    self.page_queue.append(url)

        self.page_queue.append(self.base_url)

        for url in self.page_queue:
            if url in self.crawled_urls:
                print("Crawled URL: {}".format(url))
                continue
            else:
                print("process URL to crawl: {}".format(url))

            page = PageCrawler(url=url,
                               base_url=self.base_url,
                               analyze_headings=self.analyze_headings,
                               analyze_extra_tags=self.analyze_extra_tags,
                               scrape_img=self.scrape_img)

            if page.parsed_url.netloc != page.base_domain.netloc:
                continue

            page.analyze()

            self.content_hashes[page.content_hash].add(page.url)

            for w in page.wordcount:
                self.wordcount[w] += page.wordcount[w]

            for b in page.bigrams:
                self.bigrams[b] += page.bigrams[b]

            for t in page.trigrams:
                self.trigrams[t] += page.trigrams[t]

            self.page_queue.extend(page.links)
            self.crawled_pages.append(page)
            self.crawled_urls.add(page.url)

            if not self.follow_links:
                break

            if not self.scrape_img:
                break
```
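Before wiring the crawler into the analyzer, you can exercise it directly. A throwaway snippet, assuming the modules above are importable and using a placeholder URL:

```python
from modules.crawler import WebCrawler

site = WebCrawler(base_url="https://example.com", sitemap=None,
                  analyze_headings=False, analyze_extra_tags=False,
                  follow_links=False, scrape_img=True)
site.crawl()

for page in site.crawled_pages:
    report = page.talk()  # PageCrawler.talk() is defined in the next listing
    print(report["url"], report["word_count"])
```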
Next, build a page crawler to analyze the site page by page:
```python
# src/modules/scrape.py
# file to process a single page
import re
import hashlib

from bs4 import BeautifulSoup
from seoanalyzer.page import Page
from urllib3.exceptions import HTTPError, MaxRetryError
from urllib.parse import urlsplit
from typing import Any

from modules.helper import http


class PageCrawler(Page):
    def __init__(
        self,
        url: str,
        base_url: str,
        analyze_headings: bool = False,
        analyze_extra_tags: bool = False,
        scrape_img: bool = False,
    ) -> None:
        super().__init__(url, base_url, analyze_headings, analyze_extra_tags)
        self.scrape_img: bool = scrape_img
        self.analyze_headings: bool = analyze_headings
        self.analyze_extra_tags: bool = analyze_extra_tags
        # containers for broken, internal and external links
        self.internal_links: list = []
        self.broken_link: list = []
        self.external_link: list = []
        if self.scrape_img:
            self.images = []
        if self.analyze_headings:
            self.headings = {}
        if analyze_extra_tags:
            self.additional_info = {}

    def scrape_external_link(self, bs: BeautifulSoup) -> list:
        external_link = bs.find_all("a")
        hostname = urlsplit(self.url).hostname
        for link in external_link:
            valid_link = link.get("href")
            if valid_link.startswith("http://") or valid_link.startswith("https://"):
                if urlsplit(valid_link).hostname != hostname:
                    self.external_link.append(valid_link)
        return self.external_link

    def scrape_broken_link(self, bs: BeautifulSoup) -> list:
        broken_link = bs.find_all("a")
        for broke in broken_link:
            valid_url: str = broke.get("href")
            if valid_url.startswith("http://") or valid_url.startswith("https://"):
                try:
                    broken = http.get(valid_url)
                except MaxRetryError:
                    broken = http.get_without_redirect(valid_url)
                try:
                    # urllib3 responses expose `status`
                    if broken.status == 404:
                        self.broken_link.append(valid_url)
                except AttributeError:
                    # requests responses expose `status_code`
                    if broken.status_code == 404:
                        self.broken_link.append(valid_url)
            else:
                continue
        return self.broken_link

    def scrape_internal_link(self, bs: BeautifulSoup) -> list:
        internal_link = bs.find_all("a")
        hostname = urlsplit(self.url).hostname
        for link in internal_link:
            valid_link = link.get("href")
            if valid_link.startswith("http://") or valid_link.startswith("https://"):
                if urlsplit(valid_link).hostname == hostname:
                    self.internal_links.append(valid_link)
        return self.internal_links

    def scrape_image(self, bs: BeautifulSoup) -> list:
        """Scrape image alt-text data.

        Args:
            bs (BeautifulSoup): BeautifulSoup object

        Returns:
            list: list of image alt attributes
        """
        images = bs.find_all("img")
        for img in images:
            source = img.get("alt")
            self.images.append(source)
        return self.images

    def talk(self) -> dict[str, Any]:
        """Return a dictionary of the page analysis that can be printed.

        Returns:
            dict: dictionary of context data
        """
        context: dict = {
            "url": self.url,
            "title": self.title,
            "description": self.description,
            "word_count": self.total_word_count,
            "keywords": self.sort_freq_dist(self.keywords, limit=5),
            "bigrams": self.bigrams,
            "trigrams": self.trigrams,
            "broken links": self.broken_link,
            "internal links": self.internal_links,
            "external links": self.external_link,
            "warnings": self.warnings,
            "content_hash": self.content_hash,
        }
        # append optional items
        if self.analyze_headings:
            context["headings"] = self.headings
        if self.analyze_extra_tags:
            context["additional_info"] = self.additional_info
        if self.scrape_img:
            context["images"] = self.images
        return context

    def analyze(self, raw_html=None):
        """Analyze the page and populate the warnings list."""
        if not raw_html:
            valid_prefixes = []
            # only allow http:// https:// and //
            for s in ["http://", "https://", "//"]:
                valid_prefixes.append(self.url.startswith(s))
            if True not in valid_prefixes:
                self.warn(f"{self.url} does not appear to have a valid protocol.")
                return
            if self.url.startswith("//"):
                self.url = f"{self.base_domain.scheme}:{self.url}"
            if self.parsed_url.netloc != self.base_domain.netloc:
                self.warn(f"{self.url} is not part of {self.base_domain.netloc}.")
                return
            try:
                page = http.get(self.url)
            except HTTPError as e:
                self.warn(f"Returned {e}")
                return
            encoding = "ascii"
            if "content-type" in page.headers:
                encoding = page.headers["content-type"].split("charset=")[-1]
            if encoding.lower() not in ("text/html", "text/plain", "utf-8"):
                self.warn(f"Can not read {encoding}")
                return
            else:
                raw_html = page.data.decode("utf-8")

        self.content_hash = hashlib.sha1(raw_html.encode("utf-8")).hexdigest()

        # remove comments, they screw with BeautifulSoup
        clean_html = re.sub(r"<!--.*?-->", r"", raw_html, flags=re.DOTALL)

        soup_lower = BeautifulSoup(clean_html.lower(), "html.parser")
        soup_unmodified = BeautifulSoup(clean_html, "html.parser")

        texts = soup_lower.findAll(text=True)
        visible_text = [w for w in filter(self.visible_tags, texts)]

        self.process_text(visible_text)
        self.populate(soup_lower)

        self.analyze_title()
        self.analyze_description()
        self.analyze_og(soup_lower)
        self.analyze_a_tags(soup_unmodified)
        self.analyze_img_tags(soup_lower)
        self.analyze_h1_tags(soup_lower)

        # add broken, internal and external link scraping
        self.scrape_broken_link(soup_unmodified)
        self.scrape_internal_link(soup_unmodified)
        self.scrape_external_link(soup_unmodified)

        if self.analyze_headings:
            self.analyze_heading_tags(soup_unmodified)
        if self.analyze_extra_tags:
            self.analyze_additional_tags(soup_unmodified)
        # add scrape image function
        if self.scrape_img:
            self.scrape_image(soup_unmodified)

        return True
```
Build Analyzer Instance
To run the scraper module, let's build the analyzer function that ties the custom modules together:
```python
# src/modules/analyzer.py
import time
from typing import Any
from operator import itemgetter

from modules.crawler import WebCrawler


def analyze(url, sitemap_url=None, analyze_headings=False, analyze_extra_tags=False,
            follow_links=True, scrape_img=True) -> dict[str, Any]:
    start_time = time.time()

    def calc_total_time():
        return time.time() - start_time

    output = {'pages': [], 'keywords': [], 'errors': [], 'total_time': calc_total_time()}

    site = WebCrawler(base_url=url, sitemap=sitemap_url,
                      analyze_headings=analyze_headings,
                      analyze_extra_tags=analyze_extra_tags,
                      follow_links=follow_links, scrape_img=scrape_img)
    site.crawl()

    for p in site.crawled_pages:
        output['pages'].append(p.talk())

    output['duplicate_pages'] = [list(site.content_hashes[p])
                                 for p in site.content_hashes
                                 if len(site.content_hashes[p]) > 1]

    sorted_words = sorted(site.wordcount.items(), key=itemgetter(1), reverse=True)
    sorted_bigrams = sorted(site.bigrams.items(), key=itemgetter(1), reverse=True)
    sorted_trigrams = sorted(site.trigrams.items(), key=itemgetter(1), reverse=True)

    output['keywords'] = []

    for w in sorted_words:
        if w[1] > 4:
            output['keywords'].append({'word': w[0], 'count': w[1]})

    for w, v in sorted_bigrams:
        if v > 4:
            output['keywords'].append({'word': w, 'count': v})

    for w, v in sorted_trigrams:
        if v > 4:
            output['keywords'].append({'word': w, 'count': v})

    # Sort one last time...
    output['keywords'] = sorted(output['keywords'], key=itemgetter('count'), reverse=True)
    output['total_time'] = calc_total_time()

    return output
```
Adding Helper Modules
Add helper.py to provide the HTTP utilities used by the scraper module:
```python
# src/modules/helper.py
import certifi
import requests
from urllib3 import PoolManager, Timeout
from urllib3.response import HTTPResponse
from urllib3.util import Retry
from typing import Any


class Http:
    def __init__(self):
        user_agent: dict[str, str] = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/108.0.0.0 Safari/537.36"
        }
        self.retry_strategy: Retry = Retry(
            total=10,      # maximum number of retries
            redirect=100,  # maximum number of redirects to follow
            status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry on
            allowed_methods=[  # called method_whitelist on urllib3 < 1.26
                "HEAD", "TRACE", "GET", "PUT", "OPTIONS", "DELETE",
            ],
        )
        self.http: PoolManager = PoolManager(
            timeout=Timeout(connect=1.0, read=2.0),
            cert_reqs="CERT_REQUIRED",
            ca_certs=certifi.where(),
            headers=user_agent,
            retries=self.retry_strategy,
        )

    def get(self, url: str) -> HTTPResponse:
        return self.http.request("GET", url)

    def get_without_redirect(self, url: str) -> requests.Response:
        # fall back to a plain requests call when urllib3 gives up on redirects
        return requests.get(url, verify=False, allow_redirects=False)

    def post(self, url: str, data: dict[str, Any]) -> HTTPResponse:
        return self.http.request("POST", url, fields=data)

    def put(self, url: str, data: dict[str, Any]) -> HTTPResponse:
        return self.http.request("PUT", url, fields=data)

    def delete(self, url: str) -> HTTPResponse:
        return self.http.request("DELETE", url)


http = Http()
```
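A quick sanity check of the helper (a throwaway snippet; the URL is a placeholder):

```python
from modules.helper import http

# urllib3 responses expose the status code via `status` and the body via `data`
resp = http.get("https://example.com")
print(resp.status, len(resp.data))
```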
Well done, the scraper module is now ready to use, with reusable custom functions. Let's continue to the next step and build the web app using Flask.
Build the Web App Using Flask
In a previous step we already set up the Flask application factory, so in this step we build the routes and views for the web app:
```python
# src/apps/home/views.py
from typing import Any

from flask import Blueprint, render_template, request

# from seoanalyzer import analyze  # replaced by the custom analyzer below
from modules.analyzer import analyze

home_bp: Blueprint = Blueprint(
    "home", __name__, template_folder="templates", static_folder="static"
)


@home_bp.route("/", methods=["GET", "POST"])
def index():
    if request.method == "POST":
        query = request.form.get("website")
        analyzer = analyze(
            url=query,
            follow_links=False,
        )
        # analyzer keys: ['pages', 'keywords', 'errors', 'total_time', 'duplicate_pages']
        results: list[dict[str, Any]] = []
        # process pages here
        for page in analyzer["pages"]:
            data_dict: dict[str, Any] = {
                "word count": page["word_count"],
                "page title": page["title"],
                "meta description": page["description"],
            }
            results.append(data_dict)
            for keywords in page["keywords"]:
                print(keywords)
        return render_template("index.html", datas=results)
    return render_template("index.html")
```
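The audit in the view above runs synchronously, which can be slow for large sites. Since Celery is already wired in, you could offload the crawl to a background worker. The task below is only a hypothetical sketch (the file path and task name are assumptions, and the worker must import this module so the task gets registered):

```python
# src/apps/home/tasks.py (hypothetical file, not part of the original post)
from core import celery_ext
from modules.analyzer import analyze


@celery_ext.task(name="analyze_website")
def analyze_website(url: str) -> dict:
    """Run the crawl and analysis outside the request/response cycle."""
    return analyze(url=url, follow_links=False)
```

A view could then enqueue the job with `analyze_website.delay(query)` and fetch the result later from the Celery result backend.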
Design The Templates
After building the views, let's create a template using Bootstrap in index.html, placed in the src/apps/home/templates directory:
```html
<!-- src/apps/home/templates/index.html -->
<!doctype html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>SEO Audit Tools</title>
    <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet"
          integrity="sha384-rbsA2VBKQhggwzxH7pPCaAqO46MgnOM80zW1RWuH61DGLwZJEdK2Kadq2F9CUG65" crossorigin="anonymous">
  </head>
  <body>
    <div class="container justify-content-center mx-auto">
      <h1 class="text-center">Seo Audit Tools</h1>
      <form action="" method="post">
        <div class="input-group">
          <input name="website" type="search" class="form-control rounded" placeholder="example.com"
                 aria-label="Search" aria-describedby="search-addon" />
          <button type="submit" class="btn btn-warning text-white">Audit</button>
        </div>
      </form>
    </div>
    <div class="container">
      <table class="table">
        <thead>
          <tr>
            <th scope="col">No.</th>
            <th scope="col">Word Count</th>
            <th scope="col">Page Title</th>
            <th scope="col">Meta Description</th>
            <th scope="col">Details Page</th>
          </tr>
        </thead>
        <tbody>
          {% for data in datas %}
          <tr>
            <th scope="row">{{ loop.index }}</th>
            <td>{{ data['word count'] }}</td>
            <td>{{ data['page title'] }}</td>
            <td>{{ data['meta description'] }}</td>
            <td><a class="btn btn-outline-primary btn-sm" href="">See Details</a></td>
          </tr>
          {% endfor %}
        </tbody>
      </table>
    </div>
    <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js"
            integrity="sha384-kenU1KFdBIe4zVF0s0G1M5b4hcpxyD9F7jL+jjXkk+Q2h455rYXK/7HAuoJl+0I4" crossorigin="anonymous"></script>
  </body>
</html>
```
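To try it locally you need a running Redis instance for the Celery pieces, plus the Flask dev server. The commands below are a sketch that assumes the hypothetical `src/app.py` entry point from earlier and is run from inside `src/`:

```bash
export MODE=development
python app.py                                    # start the Flask dev server
celery -A app:celery_ext worker --loglevel=info  # optional: start a Celery worker
```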
Conclusion
This post discussed how to create custom modules on top of a third-party Python module. The project also supports Docker; please check it out on GitHub. Good luck!
Original Link: https://dev.to/arifrhm/build-seo-site-analyzer-using-python-3g2n