Parsing Arbitrary Log Data for Logs Per Second and Size Data
I recently encountered a scenario that required me to parse some data that was not available during a proof of concept and was in a non-standard format. There are no doubt a multitude of tools and approaches that could achieve this, but the fastest I could think of at the time was Python.
Here are the steps I took to begin processing this data.
First, I analysed the log source data and figured out which parts I needed to achieve my objective.
The file structure made this easy, as a single day was represented in the filename. Luckily, my use case didn't require localising the timezone, as the data was in UTC.
FileName.@2020010100001TZDATA.extension
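Just to prove to myself the date really was usable on its own, a quick check like this does the trick. The filename below is made up to match the masked pattern above, and it assumes the eight digits after the @ are YYYYMMDD:

import re
from datetime import datetime

sample_name = 'FileName.@20190610000100TZDATA.extension'  # hypothetical example name
date_part = re.search(r'@(\d{8})', sample_name)
if date_part:
    print(datetime.strptime(date_part.group(1), '%Y%m%d').date())  # 2019-06-10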
My next step was to analyse the file contents to work out what data I needed and where it was.
Super simple – the only thing I needed was the timestamp, which sat right at the beginning of each entry, space-delimited:
1560053187.668 ....
There was some extra data in each file's header describing the source and structure, prefixed with a #.
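A one-off check that the leading field really is a UNIX epoch is just as quick (the sample line below is trimmed from the real data):

from datetime import datetime, timezone

sample_line = '1560053187.668 ....'  # first field is the epoch, the rest is trimmed out here
if not sample_line.startswith('#'):  # ignore the # header lines described above
    epoch = float(sample_line.split(' ')[0])
    print(datetime.fromtimestamp(epoch, tz=timezone.utc))  # 2019-06-09 04:06:27.668000+00:00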
Well, that was easy. Even if it wasn't, we would have found a way with the power of Python.
Let’s get started.
# I didn't know I'd need these imports when I started so don't be deterred.
from datetime import datetime
from collections import Counter
from statistics import mean
import operator
# Definitely knew I needed these.
import os
import re
import csv
import sys
# I'm going to regex the filenames for the specified date so I'll compile a regex match.
file_match = re.compile(r'^FileName\.@20190610.*')  # Starts with FileName.@ + Date I Want + Whatever
# The data I needed was in two folders, so I created a list with the paths in case I needed to re-use the code.
data_folders = ['X:/Folder1', 'X:/Folder2']
# Set the file size count to zero.
total_size = 0
# Because data_folders is the only variable you should need to change, I set up a dict keyed by folder to collect the results in. I will later use this dict to perform all my operations. I really need to learn pandas.
directories = {}
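For reference, this is the shape I'm aiming to build up – each folder maps to its own dict of per-second counts keyed by a time string. The numbers here are made up purely to illustrate:

# Illustrative only - what `directories` should look like once the loops below have run.
directories_example = {
    'X:/Folder1': {'04:06:27': 118, '04:06:28': 131},
    'X:/Folder2': {'04:06:27': 97, '04:06:28': 102},
}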
So we’ve set up everything we need to reference and work with the data. Now we have to start iterating, matching, and all that fun stuff.
# Move through folders
for data_folder in data_folders:
    # Add the folder dict to the root dict gigitty
    directories[data_folder] = {}
    # List folder contents
    for file in os.listdir(data_folder):
        # Get files matching our defined string
        if file_match.match(file):
            # Add the matched file's size to the total size
            total_size += os.path.getsize(data_folder + '/' + file)
            # Open the file for reading
            with open(data_folder + '/' + file) as logfile:
                # Read as space-delimited CSV
                time_line = csv.reader(logfile, delimiter=' ')
                # Set base values for the iteration
                second = 0
                count = 0
                # Compile matches once per file to check entries against and avoid weird s!*%
                comment_match = re.compile('^#.*')
                int_match = re.compile(r'^\d.*')
                # Iterate through each row in the file
                for row in time_line:
                    # Skip blank rows, header comments, and anything not starting with a digit
                    if row and not comment_match.match(row[0]) and int_match.match(row[0]):
                        # Convert the UNIX epoch to a datetime object
                        dt = datetime.fromtimestamp(float(row[0]))
                        current_time = dt.strftime('%H:%M:%S')
                        # If we're still in the same second keep counting,
                        # otherwise record the count and update the current second
                        if second == dt.second:
                            count += 1
                        else:
                            directories[data_folder][current_time] = count
                            second = dt.second
                            count = 0
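Looking back, the manual "has the second changed" bookkeeping above could be replaced by a Counter keyed directly on the formatted timestamp. A rough sketch of that idea – not what I actually ran, and the function name is just for illustration:

from collections import Counter
from datetime import datetime

def count_lines_per_second(path):
    # Tally log lines per second by keying a Counter on the HH:MM:SS string.
    per_second = Counter()
    with open(path) as logfile:
        for line in logfile:
            if line.startswith('#'):
                continue  # skip the header lines
            first_field = line.split(' ', 1)[0]
            try:
                dt = datetime.fromtimestamp(float(first_field))
            except ValueError:
                continue  # ignore anything that isn't an epoch timestamp
            per_second[dt.strftime('%H:%M:%S')] += 1
    return per_second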
# Create an empty Counter object and then start adding the per-folder counts together.
second_count_results = Counter()
for directory, times in directories.items():
    second_count_results += Counter(times)
# Convert the total size of the logs to megabytes.
total_log_size_mb = total_size * 0.000001
# Max logs per second that we encountered.
max_lps = max(second_count_results.items(), key=operator.itemgetter(1))[1]
# Minimum logs per second that we encountered.
min_lps = min(second_count_results.items(), key=operator.itemgetter(1))[1]
# Find the average LPS.
list_of_lps = [v for k, v in second_count_results.items()]
average_lps = mean(list_of_lps)
# Print to STDOUT the data we figured out in our heads.
print('Max lps observed: ' + str(max_lps) + ' min lps observed: ' + str(min_lps))
print('Size of the data we calculated (MB): ' + str(total_log_size_mb))
print('Average LPS was: ' + str(average_lps))
# Print and save the per-second results.
with open('proxy_lps_data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for increment in sorted(second_count_results.items()):
        timestamp = increment[0]
        count = increment[1]
        writer.writerow([timestamp, str(count)])
        print(timestamp + ': ' + str(count))
# Save the calculation metadata.
with open('proxy_metrics_data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['size', str(total_log_size_mb)])
    writer.writerow(['min_lps', str(min_lps)])
    writer.writerow(['max_lps', str(max_lps)])
    writer.writerow(['avg_lps', str(average_lps)])
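If you want to eyeball the busiest seconds later without re-running everything, the saved CSV is easy to pull back in – a quick sketch:

import csv

# Read the per-second counts back and print the five busiest seconds.
with open('proxy_lps_data.csv', newline='') as csvfile:
    rows = [(time_key, int(count)) for time_key, count in csv.reader(csvfile)]

for time_key, count in sorted(rows, key=lambda row: row[1], reverse=True)[:5]:
    print(time_key, count)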