DNS와 IP 정보 얻기

DNS란?
IP정보?

nslookup 예시

https://findip.net/

# SCRAP DNS
import numpy as np
import pandas as pd

# Output directory where every intermediate CSV checkpoint is written.
path="C:/Users/User/Downloads/2024_상품추천_scrapping/"

# Pipe-delimited source dump containing company IDs and their URLs.
file_path = 'C:/Users/User/Downloads/KED/[기업사업부-2024-DB0021][LG유플러스] 2023년 4분기DB CD제공 추출요청/1. 전체자료.txt'

# Only two columns are needed: position 0 (KEDCD) and position 10 (URL).
columns_to_load = [0, 10]

# Rows per chunk when streaming the large dump.
chunk_size = 100000

# Accumulator for streamed chunks plus a chunk progress counter.
df_chunks = []
i = 0
# Read the file in chunks
# Stream the pipe-delimited dump chunk by chunk, drop incomplete rows,
# and collect the cleaned pieces before concatenating them.
reader = pd.read_csv(file_path, sep='|', usecols=columns_to_load,
                     chunksize=chunk_size, encoding="mbcs", header=None, dtype=str)
for n, frame in enumerate(reader, start=1):
    # Progress indicator: upper bound on rows consumed so far.
    print(n*chunk_size)
    df_chunks.append(frame.dropna())

# Stitch the cleaned chunks back together with a fresh index.
data = pd.concat(df_chunks, ignore_index=True)

data.columns = ['KEDCD','URL']
# Checkpoint: the raw (KEDCD, URL) pairs.
data.to_csv(path+'01_urls.csv', index=False)
 
import dns.resolver
 
def get_dns_server_name(domain):
    """Return the first authoritative NS record for the URL's host.

    On any failure (bad URL, DNS error, timeout) the exception message is
    returned as a string instead of raising, so a bulk ``.apply()`` over the
    URL column never stops mid-run.
    """
    try:
        # Strip scheme, path and query string to isolate the hostname.
        host = domain.split("//")[-1]
        host = host.split("/")[0]
        host = host.split('?')[0]
        records = dns.resolver.resolve(host, 'NS')
        return str(records[0])
    except Exception as err:
        # Error text is stored in place of a server name; downstream
        # filtering relies on recognizable error-message prefixes.
        return str(err)
# Spot-check on a single row (left disabled):
#get_dns_server_name(data['URL'][1])

from datetime import datetime
start_time = datetime.now()
print(f"Start Time: {start_time}")

# Look up the authoritative name server for every URL.
# Network-bound: one DNS query per row, so this can take hours at this scale.
data['dns_server_name'] = data['URL'].apply(get_dns_server_name)

end_time = datetime.now()
print(f"End Time: {end_time}")
# Checkpoint: URLs plus their DNS server names (or error text on failure).
data.to_csv(path+'02_urls_with_dns.csv', index=False)
 
 
 
 
 
import socket

def get_dns_server(url):
    """Resolve the URL's hostname to an IPv4 address string.

    On failure the exception message is returned as a string so a bulk
    ``.apply()`` never aborts.  NOTE(review): despite the name, this returns
    the host's own IP via gethostbyname, not the IP of its DNS server.
    """
    try:
        # Drop the scheme, then any path, then any query string.
        hostname = url.split("//")[-1].split("/")[0].split('?')[0]
        return socket.gethostbyname(hostname)
    except Exception as err:
        return str(err)
 
# Smoke-test the resolver on the first URL.
get_dns_server(data['URL'][0])

start_time = datetime.now()
print(f"Start Time: {start_time}")
# Resolve every URL to an IP address (network-bound, one lookup per row).
data['dns_server_ip'] = data['URL'].apply(get_dns_server)
end_time = datetime.now()
print(f"End Time: {end_time}")


# Checkpoint: URLs with their resolved IPs (or error text on failure).
data.to_csv(path+'03 urls_with_ip.csv', index=False)
 
 
 
 
 
import requests
 
#url = 'https://api.findip.net/164.124.106.136/?token=a1a12781d7ef4a8db32fccc9d5982b53'
#params ={'serviceKey' : '서비스키', 'query' : 'kisa.or.kr', 'answer' : 'xml' }
 
 
def get_ip_info(ip):
    """Fetch geolocation/ISP metadata for *ip* from api.findip.net.

    Returns the raw response body (bytes); callers JSON-decode it later.
    See https://www.findip.net/Main for the payload format.
    """
    # NOTE(review): the API token is hard-coded — move it to an environment
    # variable or config file before sharing this script.
    endpoint = 'https://api.findip.net/'+ip+'/'
    query = {'token': 'a1a12781d7ef4a8db32fccccd5982b53'[:0] or 'a1a12781d7ef4a8db32fccc9d5982b53'}
    # NOTE(review): verify=False disables TLS certificate checking and is
    # vulnerable to man-in-the-middle interception; it also triggers the
    # InsecureRequestWarning silenced further down.
    reply = requests.get(endpoint, params=query, verify=False)
    return reply.content
 
import warnings
# Silence urllib3's InsecureRequestWarning triggered by verify=False in
# get_ip_info (suppresses all other warnings too).
warnings.filterwarnings("ignore")

start_time = datetime.now()
print(f"Start Time: {start_time}")
#data['ip_info'] = data['URL'].apply(get_ip_info)
# Pre-create the column so the row-wise .loc assignments below are cheap.
data['ip_info'] = ''



# NOTE(review): the start index 171946 is a manual resume checkpoint from a
# previously interrupted run — reset it to 0 for a fresh dataset.
for i in range(171946,len(data)):
    print (i)
    # One HTTP request per row; the raw response bytes are stored as-is.
    data.loc[i,'ip_info'] = get_ip_info(data.loc[i,'dns_server_ip'])
    print (data.loc[i,'ip_info'])
end_time = datetime.now()
print(f"End Time: {end_time}")


# Checkpoint: raw ip_info payloads alongside URL, DNS name, and IP.
data.to_csv(path+'04 urls_with_ip_info.csv', index=False)
 
 
 
import json
from json.decoder import JSONDecodeError
# Row indices whose ip_info payload failed to JSON-decode.
err_list = []
#for i in range(60784,len(data)):
# NOTE(review): err_list is empty at this point, so this loop never executes
# as written — the commented-out range(...) line above was presumably the
# original driver.  Also, the JSONDecodeError branch appends to err_list
# *while it is being iterated*, which would retry a persistently bad row
# forever.  Confirm the intended driver before re-running this stage.
for i in err_list:
    try:
        # ip_info holds the raw findip.net response; flatten it into columns.
        tmp = json.loads(data.loc[i, 'ip_info'])
        print(data.loc[i, 'ip_info'])
        #print('City Name:', tmp['city']['names']['en'])
        #print('Continent Code:', tmp['continent']['code'])
        #print('Country Name:', tmp['country']['names']['en'])
        #print('Latitude:', tmp['location']['latitude'])
        #print('Longitude:', tmp['location']['longitude'])
        #print('Time Zone:', tmp['location']['time_zone'])
        #print('Weather Code:', tmp['location']['weather_code'])

        # Geography fields.
        data.loc[i,'city'] = tmp['city']['names']['en']
        data.loc[i, 'continent'] = tmp['continent']['code']
        data.loc[i, 'country'] = tmp['country']['names']['en']
        data.loc[i, 'lat'] = tmp['location']['latitude']
        data.loc[i, 'lon'] = tmp['location']['longitude']
        data.loc[i, 'time_zone'] = tmp['location']['time_zone']

        # Concatenate all English subdivision names into one '/'-joined string.
        subd = ''
        for subdivision in tmp['subdivisions']:
            if 'en' in subdivision['names']:
                #print('Subdivision Name:', subdivision['names']['en'])
                subd = subd + '/' + subdivision['names']['en']
        data.loc[i, 'subdivision'] = subd

        #print('Autonomous System Number:', tmp['traits']['autonomous_system_number'])
        #print('Autonomous System Organization:', tmp['traits']['autonomous_system_organization'])
        #print('Connection Type:', tmp['traits']['connection_type'])
        #print('ISP:', tmp['traits']['isp'])
        #print('User Type:', tmp['traits']['user_type'])

        # Network/ISP fields.
        data.loc[i,'as_number'] = tmp['traits']['autonomous_system_number']
        data.loc[i,'as_org'] = tmp['traits']['autonomous_system_organization']
        data.loc[i,'connection_type'] = tmp['traits']['connection_type']
        data.loc[i,'isp'] = tmp['traits']['isp']
        data.loc[i,'user_type'] = tmp['traits']['user_type']
    except JSONDecodeError as e:
        # Bad payload: re-fetch once and remember the row for another pass.
        print('Error--')
        data.loc[i, 'ip_info'] = get_ip_info(data.loc[i, 'dns_server_ip'])
        print(data.loc[i, 'ip_info'])
        err_list.append(i)
    except KeyError:
        # Payload parsed but lacked an expected field; leave the row partial.
        print('Error')
    except TypeError:
        # Payload was not string/bytes (e.g. NaN); skip the row.
        print('Error-')



# Checkpoint: flattened geo/ISP columns alongside the raw payload.
data.to_csv(path+'06 urls_with_ip_info_orgez.csv', index=False)
 
 
 
# Reload the checkpoint so this stage can run independently of the above.
# dtype='str' keeps IDs/IPs as text; missing cells still come back as NaN.
data = pd.read_csv(path+'06 urls_with_ip_info_orgez.csv', dtype='str')

import re
import numpy as np
# Function to extract based on the specified condition
def extract_domain(row):
    """Extract the registrable (second-level) domain from an NS host name.

    Parameters
    ----------
    row : str
        A dns_server_name cell: either an NS host name ending in a dot
        (e.g. "ns1.example.co.kr.") or an error message produced upstream.

    Returns
    -------
    str or float
        The registrable domain (keeping three labels for known two-label
        suffixes such as .co.kr), or numpy.nan for resolver error messages,
        non-string values, and strings without a trailing dotted host name.
    """
    # Robustness: NaN from the CSV round-trip is a float, which the original
    # passed straight into regex matching (TypeError). Bail out early.
    if not isinstance(row, str):
        return np.nan
    # Reject known resolver error messages; require URL-safe characters.
    url_pattern = r'^(?!(The DNS|All nameservers|The resolution))(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    if not re.match(url_pattern, row):
        return np.nan
    # Public suffixes that occupy two labels: keep three labels for these.
    two_label_suffixes = ('.co.kr.', '.or.kr.', '.go.kr.', '.co.uk.', '.ne.kr.')
    if row.endswith(two_label_suffixes):
        domain_re = r'([a-zA-Z0-9-]+\.[a-zA-Z0-9-]+\.[a-zA-Z]{2,})\.$'
    else:
        domain_re = r'([a-zA-Z0-9-]+\.[a-zA-Z]{2,})\.$'
    # Plain re.search replaces the per-call pd.Series(...).str.extract,
    # which has identical search semantics but built a DataFrame per row.
    match = re.search(domain_re, row)
    return match.group(1) if match else np.nan
# #url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# url_pattern = r'^(?!(The DNS|All nameservers|The resolution))(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# filtered_df = data[data['dns_server_name'].str.contains(url_pattern, flags=re.IGNORECASE, na=False)]
# filtered_df.groupby('dns_server_name').count()
#
# # Apply the function to the DataFrame
# filtered_df['extracted_domain'] = filtered_df['dns_server_name'].apply(extract_domain)
# filtered_df.groupby('extracted_domain').count()
# filtered_df.to_csv(path+'07 url_with_ip_info_orgez_dns_exists_filtered.csv', index=False)
 
 
 
 
 
 
# Derive the registrable domain for every DNS server name.
data['extracted_domain'] = data['dns_server_name'].apply(extract_domain)
data.to_csv(path+'07 url_with_ip_info_orgez_dns_2nd_level.csv', index=False)

data.head()

# Drop the bulky raw JSON column before writing the final compact output.
data2 = data.drop('ip_info',axis=1)

data2.to_csv(path+'08 url_with_ip_info_orgez_dns_2nd_level_wo_ip_info.csv', index=False)