Hcrawler - Crawler em Python [Python]

Hcrawler - Crawler em Python

Publicado por henrique (última atualização em 24/07/2023)

[ Hits: 1.643 ]

Homepage: https://mrh-ofici.github.io/

Download hcrawler.py

2 0

Denuncie Favoritos Indicar

Um detector de links e emails em Python com funções extras

Esconder código-fonte

#!/bin/python3
import argparse, requests, re
from bs4 import BeautifulSoup
from colorama import Fore

# Start-up banner: a separator line followed by the tool name drawn with
# Unicode block characters in light red; the colour is then switched to
# light green for everything printed afterwards.
# The art previously contained HTML numeric character references
# (e.g. "&#9604;"), which Python prints literally; they are decoded here to
# the characters they encode.
print('-' * 65)
print(Fore.LIGHTRED_EX + """
        ▄▀▀▄ ▄▄   ▄▀▄▄▄▄   ▄▀▀▄▀▀▀▄  ▄▀▀█▄   ▄▀▀▄    ▄▀▀▄  ▄▀▀▀▀▄     ▄▀▀█▄▄▄▄  ▄▀▀▄▀▀▀▄ 
        █  █   ▄▀ █ █    ▌ █   █   █ ▐ ▄▀ ▀▄ █   █    ▐  █ █    █     ▐  ▄▀   ▐ █   █   █ 
        ▐  █▄▄▄█  ▐ █      ▐  █▀▀█▀    █▄▄▄█ ▐  █        █ ▐    █       █▄▄▄▄▄  ▐  █▀▀█▀  
        █   █    █       ▄▀    █   ▄▀   █   █   ▄    █      █        █    ▌   ▄▀    █  
        ▄▀  ▄▀   ▄▀▄▄▄▄▀ █     █   █   ▄▀     ▀▄▀ ▀▄ ▄▀    ▄▀▄▄▄▄▄▄▀ ▄▀▄▄▄▄   █     █   
        █   █    █     ▐  ▐     ▐   ▐   ▐            ▀      █         █    ▐   ▐     ▐   
        ▐   ▐    ▐                                          ▐         ▐                  
\n""" + Fore.LIGHTGREEN_EX)

def _parse_data(raw):
    """Convert the ``--data`` command-line value into a dict.

    The option previously used ``type=hash``, which turned the text into an
    integer hash, so the value later passed to ``requests.get(data=...)`` was
    never usable. The value is now parsed as a JSON object.

    Args:
        raw: The raw option text, e.g. ``'{"password": "123"}'``.

    Returns:
        The decoded dict.

    Raises:
        argparse.ArgumentTypeError: If *raw* is not valid JSON or does not
            decode to an object; argparse reports this as a usage error.
    """
    import json  # local import keeps this block self-contained

    try:
        parsed = json.loads(raw)
    except ValueError as err:
        raise argparse.ArgumentTypeError(f'invalid JSON: {err}') from None
    if not isinstance(parsed, dict):
        raise argparse.ArgumentTypeError('expected a JSON object such as {"password": "123"}')
    return parsed


# Command-line interface. The boolean-like switches (-l, -e, -c) are stored as
# the literal strings 'True' / 'False' and compared as strings by the rest of
# the script.
parser = argparse.ArgumentParser(description='A detector de links and emails in websites', usage='./hcrawler.py -d domain -l False -o output.txt')
parser.add_argument('-d', '--domain', action='store', dest='hosts', help='domain', required=True)
parser.add_argument('-u', '--user-agent', action='store', dest='agent', help='user-agent', default="Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0")
parser.add_argument('--cookies', action='store', dest='cookie', help='Cookies')
parser.add_argument('--data', action='store', dest='data', help='JSON object sent as request data, e.g. \'{"password": "123"}\'', type=_parse_data)
parser.add_argument('-o', '--output', action='store', dest='output', help='Save files of output')
parser.add_argument('-l', '--links', action='store', dest='links', default='True', help='Mode links')
parser.add_argument('-e', '--email', dest='emails', help='Mode email', action='store', default='False')
parser.add_argument('-c', dest='cont', help='Just crawling the website and not crawling your links', default='True', action='store')
arguments = parser.parse_args()

# Shared crawl state.
EMAILS = []       # e-mail addresses already reported (used for de-duplication)
TO_CRAWL = []     # URLs still waiting to be fetched; crawl() pops from the end
CRAWLED = set()   # URLs that have already been processed
if arguments.output:
    import atexit

    print('[*] Mode output activated\n')
    # Explicit UTF-8 so non-ASCII URLs or addresses cannot fail on platforms
    # whose default encoding is narrower.
    file = open(arguments.output, 'w', encoding='utf-8')
    # The handle is used by crawl()/crawlb() for the whole run, so it cannot
    # live in a ``with`` block here; close (and flush) it at interpreter exit.
    atexit.register(file.close)

def request(url):
    """Fetch *url* with an HTTP GET and return the response body as text.

    The User-Agent, optional Cookie header and optional request data are
    taken from the parsed command-line ``arguments``.

    Args:
        url: Address to fetch.

    Returns:
        The response text, or ``None`` when the request fails for any reason.
        Callers treat a falsy result as "nothing to parse".

    Raises:
        SystemExit: If the user interrupts the request with Ctrl+C.
    """
    header = {"User-Agent": arguments.agent}
    if arguments.cookie:
        header['Cookie'] = arguments.cookie
    # Only forward ``data`` when the user actually supplied it.
    extra = {"data": arguments.data} if arguments.data else {}
    try:
        # Without a timeout a single unresponsive server would stall the
        # whole crawl indefinitely.
        response = requests.get(url, headers=header, timeout=30, **extra)
        return response.text
    except KeyboardInterrupt:
        print('-' * 65)
        raise SystemExit
    except Exception:
        # Best-effort crawler: one bad URL (invalid scheme, DNS failure,
        # timeout, ...) must not abort the run.
        return None


def get_links(html):
    """Return the absolute http(s) links found in the ``<a href>`` tags of *html*.

    Args:
        html: HTML document text.

    Returns:
        A list of ``href`` values that start with ``http://`` or ``https://``,
        in document order. An empty list is returned when nothing matches or
        when the document cannot be parsed, so callers can always iterate
        over the result.
    """
    try:
        soup = BeautifulSoup(html, "html.parser")
        anchors = soup.find_all("a", href=True)
        # Require a full scheme prefix: a bare "http" check would also accept
        # relative paths such as "httpdocs/page.html".
        return [
            tag["href"]
            for tag in anchors
            if tag["href"].startswith(("http://", "https://"))
        ]
    except Exception:
        # Parsing is best-effort; a broken page yields no links instead of
        # None, which would break callers that loop over the result.
        return []

def get_emails(html):
    """Return every e-mail-like substring of *html*, in order of appearance.

    A match is a run of word characters and dots on each side of an ``@``:
    at least two characters before it, at least three after it, and a word
    character at both ends. Duplicates are kept.
    """
    pattern = r"\w[\w\.]+@\w[\w\.]+\w"
    return [found.group(0) for found in re.finditer(pattern, html)]

def crawl():
    """Crawl every URL queued in ``TO_CRAWL``, following discovered links.

    URLs are popped from the end of ``TO_CRAWL`` until it is empty. For each
    fetched page, depending on the command-line modes:

    * link mode: links not yet crawled or queued are appended to ``TO_CRAWL``
      and a progress line is printed;
    * email mode: addresses not yet in ``EMAILS`` are printed and recorded.

    Every processed URL, fetched successfully or not, is added to ``CRAWLED``
    and, when an output file was requested, written to it on its own line.
    """
    follow_links = arguments.links == 'True'
    collect_emails = arguments.emails == 'True'
    if follow_links:
        print('[*] Mode link activated\n')
    if collect_emails:
        print('[*] Mode email activated\n')

    while TO_CRAWL:
        url = TO_CRAWL.pop()
        # Mark the URL as visited *before* its links are examined; otherwise
        # a page that links to itself is queued again and fetched twice.
        CRAWLED.add(url)

        html = request(url)
        if html:
            if follow_links:
                # ``or []`` tolerates a link extractor that reports failure
                # with None.
                for link in get_links(html) or []:
                    if link not in CRAWLED and link not in TO_CRAWL:
                        TO_CRAWL.append(link)
                print(f"[*] Crawling {url}\n")

            if collect_emails:
                for email in get_emails(html):
                    if email not in EMAILS:
                        print(f'[*] {email}')
                        EMAILS.append(email)

        if arguments.output:
            file.write(f'{url}\n')

    print("Done!!!\n")

def crawlb():
    """Inspect only the start page, without following its links.

    Reads the module-level ``url`` set by the script entry point. When the
    page can be fetched, link mode prints every link found and email mode
    prints every distinct address; each printed item is also written to the
    output file when one was requested. The start URL itself is written to
    the output file whether or not the fetch succeeded.
    """
    html = request(url)
    if html:
        if arguments.links == 'True':
            print('[*] Mode link activated\n')
            # ``or []`` tolerates a link extractor that reports failure with
            # None, which would otherwise raise TypeError here.
            for link in get_links(html) or []:
                print(f'[*] {link}\n')
                if arguments.output:
                    file.write(f'{link}\n')

        if arguments.emails == 'True':
            print('[*] Mode email activated\n')
            for email in get_emails(html):
                if email not in EMAILS:
                    # Record the address so the membership test above really
                    # suppresses repeats; it was never recorded before.
                    EMAILS.append(email)
                    print(f'[*] {email}\n')
                    if arguments.output:
                        file.write(f'{email}\n')

    if arguments.output:
        file.write(f'{url}\n')
        print('[*] Writing output...\n')
if __name__ == "__main__":
    # ``url`` is intentionally a module-level name: crawlb() reads it as a
    # global.
    url = arguments.hosts
    # The documented usage passes a bare domain ("-d example.com"), but
    # requests rejects URLs without a scheme, so the start page would fail
    # silently. Default to http:// when no scheme is given.
    if not url.startswith(("http://", "https://")):
        url = "http://" + url
    TO_CRAWL.append(url)
    if arguments.cont == 'True':
        # Default mode: keep following discovered links.
        crawl()
    else:
        # Single-page mode: inspect only the start URL.
        crawlb()
        print('[*] Done!!!\n')
    # Restore the terminal's default colour before the closing separator.
    print(Fore.RESET + '-' * 65)

Scripts recomendados

baixador em python 2.0

Google scan com interface gráfica

hcrawler 3.0

Script em Python 3.6 para fazer scrape de uma URL exportando métricas no formato Prometheus

Consumo de API aberta IBGE

Comentários

Nenhum comentário foi encontrado.