# ====== Getting DNS and IP information ======  (wiki: "DNS와 IP 정보 얻기")
# What is DNS? IP info? nslookup example — https://findip.net/
#
# One-shot scraping pipeline:
#   01: load KEDCD/URL pairs from a pipe-delimited dump
#   02: look up the authoritative NS name for each URL's host
#   03: resolve each host to an IP address
#   04: fetch geo/ISP JSON for each IP from api.findip.net
#   06: flatten selected JSON fields into columns
#   07/08: extract a second-level domain from the NS name and save
# NOTE(review): this is a stateful, resumable REPL-style script (see the
# hard-coded resume index and the manually populated err_list below) —
# it is not safe to re-run blindly from the top.

# SCRAP DNS
import numpy as np
import pandas as pd

# Output directory for every intermediate CSV produced below.
path = "C:/Users/User/Downloads/2024_상품추천_scrapping/"

# Define the file path (pipe-delimited source dump; Korean path kept verbatim)
file_path = 'C:/Users/User/Downloads/KED/[기업사업부-2024-DB0021][LG유플러스] 2023년 4분기DB CD제공 추출요청/1. 전체자료.txt'

# Define the column names that you want to load
# (column 0 = company code, column 10 = URL; renamed to KEDCD/URL below)
columns_to_load = [0, 10]

# Define the chunk size
chunk_size = 100000  # Number of rows per chunk

# Create an empty DataFrame to hold chunks
df_chunks = []
i = 0

# Read the file in chunks.
# NOTE(review): encoding="mbcs" is the Windows ANSI codepage — this only
# works on Windows (presumably cp949 on a Korean locale); confirm before
# running elsewhere.
for chunk in pd.read_csv(file_path, sep='|', usecols=columns_to_load,
                         chunksize=chunk_size, encoding="mbcs",
                         header=None, dtype=str):
    i = i + 1
    print(i * chunk_size)  # progress: rows read so far (upper bound)
    # Process each chunk (if necessary): drop rows missing either column
    chunk = chunk.dropna()
    # Append the chunk to the list
    df_chunks.append(chunk)

# Concatenate the chunks together
data = pd.concat(df_chunks, ignore_index=True)
data.columns = ['KEDCD', 'URL']
data.to_csv(path + '01_urls.csv', index=False)

import dns.resolver


def get_dns_server_name(domain):
    """Return the first authoritative name-server name for a URL's host.

    On any failure the exception text is returned instead of a name;
    extract_domain() below filters those error strings back out via its
    negative lookahead ("The DNS", "All nameservers", "The resolution").
    """
    try:
        # Extract the top-level domain (really the hostname: strip the
        # scheme, then anything after the first '/' or '?')
        tld = domain.split("//")[-1].split("/")[0].split('?')[0]
        # Query for NS records
        answers = dns.resolver.resolve(tld, 'NS')
        # Return the first authoritative name server
        return str(answers[0])
    except Exception as e:
        # Best-effort: store the error message in the column rather than abort
        return str(e)


#get_dns_server_name(data['URL'][1])

from datetime import datetime

start_time = datetime.now()
print(f"Start Time: {start_time}")
data['dns_server_name'] = data['URL'].apply(get_dns_server_name)
end_time = datetime.now()
print(f"End Time: {end_time}")
data.to_csv(path + '02_urls_with_dns.csv', index=False)

import socket


def get_dns_server(url):
    """Resolve the URL's hostname to an IP address (A record).

    Like get_dns_server_name(), failures return the exception text
    instead of raising, so error strings end up in the column.
    """
    try:
        # Extract hostname from URL
        hostname = url.split("//")[-1].split("/")[0].split('?')[0]
        # Get IP address of the hostname
        ip_address = socket.gethostbyname(hostname)
        return ip_address
    except Exception as e:
        return str(e)


get_dns_server(data['URL'][0])  # smoke test on the first row

start_time = datetime.now()
print(f"Start Time: {start_time}")
data['dns_server_ip'] = data['URL'].apply(get_dns_server)
end_time = datetime.now()
print(f"End Time: {end_time}")
data.to_csv(path + '03 urls_with_ip.csv', index=False)

import requests

#url = 'https://api.findip.net/164.124.106.136/?token=a1a12781d7ef4a8db32fccc9d5982b53'
#params ={'serviceKey' : '서비스키', 'query' : 'kisa.or.kr', 'answer' : 'xml' }


def get_ip_info(ip):
    """Fetch raw geo/ISP JSON bytes for `ip` from api.findip.net.

    Returns response.content (bytes) unparsed; parsing happens later.
    NOTE(review): the API token is hard-coded in source — rotate it and
    load from an environment variable instead of committing it.
    NOTE(review): verify=False disables TLS certificate verification;
    combined with warnings.filterwarnings("ignore") below, cert problems
    are completely silent — confirm this is intentional.
    """
    #https://www.findip.net/Main
    url = 'https://api.findip.net/' + ip + '/'
    params = {'token': 'a1a12781d7ef4a8db32fccc9d5982b53'}
    #url=' https://api.findip.net/164.124.106.136/?token=a1a12781d7ef4a8db32fccc9d5982b53'
    response = requests.get(url, params=params, verify=False)
    return response.content
    #print(response.content)


import warnings

warnings.filterwarnings("ignore")  # silences the InsecureRequestWarning spam

start_time = datetime.now()
print(f"Start Time: {start_time}")
#data['ip_info'] = data['URL'].apply(get_ip_info)
data['ip_info'] = ''
# NOTE(review): 171946 is a resume index from a previous partial run —
# rows before it keep the empty-string ip_info placeholder. Reset to 0
# (or re-derive) for a fresh run.
for i in range(171946, len(data)):
    print(i)
    data.loc[i, 'ip_info'] = get_ip_info(data.loc[i, 'dns_server_ip'])
    print(data.loc[i, 'ip_info'])
end_time = datetime.now()
print(f"End Time: {end_time}")
data.to_csv(path + '04 urls_with_ip_info.csv', index=False)

import json
from json.decoder import JSONDecodeError

# NOTE(review): as committed, err_list is empty so this loop is a no-op —
# presumably it was filled by hand with failing row indices between runs.
# The JSONDecodeError branch also appends to err_list *while iterating it*
# (retried rows get re-queued into the same pass) — confirm that behavior
# is intended before reusing this loop.
err_list = []
#for i in range(60784,len(data)):
for i in err_list:
    try:
        tmp = json.loads(data.loc[i, 'ip_info'])
        print(data.loc[i, 'ip_info'])
        #print('City Name:', tmp['city']['names']['en'])
        #print('Continent Code:', tmp['continent']['code'])
        #print('Country Name:', tmp['country']['names']['en'])
        #print('Latitude:', tmp['location']['latitude'])
        #print('Longitude:', tmp['location']['longitude'])
        #print('Time Zone:', tmp['location']['time_zone'])
        #print('Weather Code:', tmp['location']['weather_code'])
        # Flatten the nested findip.net JSON into flat columns
        data.loc[i, 'city'] = tmp['city']['names']['en']
        data.loc[i, 'continent'] = tmp['continent']['code']
        data.loc[i, 'country'] = tmp['country']['names']['en']
        data.loc[i, 'lat'] = tmp['location']['latitude']
        data.loc[i, 'lon'] = tmp['location']['longitude']
        data.loc[i, 'time_zone'] = tmp['location']['time_zone']
        # Concatenate all English subdivision names as '/name1/name2/...'
        subd = ''
        for subdivision in tmp['subdivisions']:
            if 'en' in subdivision['names']:
                #print('Subdivision Name:', subdivision['names']['en'])
                subd = subd + '/' + subdivision['names']['en']
        data.loc[i, 'subdivision'] = subd
        #print('Autonomous System Number:', tmp['traits']['autonomous_system_number'])
        #print('Autonomous System Organization:', tmp['traits']['autonomous_system_organization'])
        #print('Connection Type:', tmp['traits']['connection_type'])
        #print('ISP:', tmp['traits']['isp'])
        #print('User Type:', tmp['traits']['user_type'])
        data.loc[i, 'as_number'] = tmp['traits']['autonomous_system_number']
        data.loc[i, 'as_org'] = tmp['traits']['autonomous_system_organization']
        data.loc[i, 'connection_type'] = tmp['traits']['connection_type']
        data.loc[i, 'isp'] = tmp['traits']['isp']
        data.loc[i, 'user_type'] = tmp['traits']['user_type']
    except JSONDecodeError as e:
        # Not valid JSON (e.g. an error string stored by get_ip_info):
        # refetch once and queue the row for another pass
        print('Error--')
        data.loc[i, 'ip_info'] = get_ip_info(data.loc[i, 'dns_server_ip'])
        print(data.loc[i, 'ip_info'])
        err_list.append(i)
    except KeyError:
        # JSON parsed but a field (e.g. 'city' or a trait) is missing
        print('Error')
    except TypeError:
        print('Error-')

data.to_csv(path + '06 urls_with_ip_info_orgez.csv', index=False)

# Reload the flattened snapshot (all columns as strings)
data = pd.read_csv(path + '06 urls_with_ip_info_orgez.csv', dtype='str')

import re
import numpy as np


# Function to extract based on the specified condition
def extract_domain(row):
    """Extract a registrable domain from a trailing-dot NS host name.

    The negative lookahead rejects rows holding dnspython error text
    ("The DNS ...", "All nameservers ...", "The resolution ...") stored
    by get_dns_server_name(). For second-level ccTLD suffixes listed
    below, three labels are kept (e.g. foo.co.kr); otherwise two
    (e.g. foo.com). Returns np.nan when the row doesn't look like a name.
    """
    url_pattern = r'^(?!(The DNS|All nameservers|The resolution))(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    regex = re.compile(url_pattern)
    if regex.match(row):
        if row.endswith('.co.kr.') or row.endswith('.or.kr.') or row.endswith('.go.kr.') or row.endswith('.co.uk.') or row.endswith('.ne.kr.'):
            match = pd.Series(row).str.extract(r'([a-zA-Z0-9-]+\.[a-zA-Z0-9-]+\.[a-zA-Z]{2,})\.$')
        else:
            match = pd.Series(row).str.extract(r'([a-zA-Z0-9-]+\.[a-zA-Z]{2,})\.$')
        return match.iloc[0, 0]
    else:
        return np.nan


# #url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# url_pattern = r'^(?!(The DNS|All nameservers|The resolution))(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# filtered_df = data[data['dns_server_name'].str.contains(url_pattern, flags=re.IGNORECASE, na=False)]
# filtered_df.groupby('dns_server_name').count()
#
# # Apply the function to the DataFrame
# filtered_df['extracted_domain'] = filtered_df['dns_server_name'].apply(extract_domain)
# filtered_df.groupby('extracted_domain').count()
# filtered_df.to_csv(path+'07 url_with_ip_info_orgez_dns_exists_filtered.csv', index=False)

data['extracted_domain'] = data['dns_server_name'].apply(extract_domain)
data.to_csv(path + '07 url_with_ip_info_orgez_dns_2nd_level.csv', index=False)
data.head()

# Final export without the bulky raw JSON column
data2 = data.drop('ip_info', axis=1)
data2.to_csv(path + '08 url_with_ip_info_orgez_dns_2nd_level_wo_ip_info.csv', index=False)

# (wiki page footer)
# {{tag>data_analysis:work python IP정보 DNS 네임서버}}
# ~~DISCUSSION~~