#!/usr/bin/env python

'''A script to check a FreeBSD package installation: fix the +REQUIRED_BY
files, update the origins to the most recent or last valid version appearing
in the MOVED file, detect absent dependencies, and report "leaf" packages
which are not required by anything and so are candidates for removal.'''

# Author: Michel Talon
# Version 1.0
# Licence: BSD (revised)
# Date: February, March 2007

import os, re, time, mmap

# The following illustrates the timings involved.
# The first run is longer, around 10s; after that the files are in cache.
# rose% time ./check_pkg.py
# There are now 360 packages installed.
# ./check_pkg.py  0,28s user 0,01s system 99% cpu 0,300 total

###### Variables to be user customized ##############

portdir = "/usr/ports/"       # can be replaced for non standard installs
pkgdir = "/var/db/pkg/"       # idem
conn_comp_discover = True     # Do we discover connected components?
write_graphviz = True         # Do we display them graphically?
do_fix_packages = False       # Do we fix the package database? Needs root.

###### Some preparation ############################

###### A few global variables ##

matchversion = re.compile('-([a-zA-Z0-9\._,]+)$')  # matches a version number
inst_pkgs = {}        # installed packages indexed by origin
moved = {}            # all origin name movements from the MOVED file
last_valid_mov = {}   # last valid name of removed ports
checklog = open('CheckLog', 'w')

#################################
# A utility function.

def print_by_3(array, escapenl = False) :
    '''Formats 3 elements of array per line. The result is a string
    enclosed in quotes, with white space separated elements, suitable for
    a shell script.'''
    p_string = ""
    sh_array = []   # short array
    index = 0
    len_array = len(array)
    def print_lte_3(sh_array, escapenl, end = False) :
        "Formats a single line with at most 3 elements."
        if end :
            eol = "'\n"
        else :
            if not escapenl :
                eol = " \n"
            else :
                eol = " \\\n"
        return " ".join(sh_array) + eol
    p_string += "'"              # beginning, print opening '
    if len_array == 0 :
        p_string += "'\n"        # empty array, return the empty quoted string
        return p_string          # get out
    for elem in array :          # array is not empty
        sh_array.append(elem)
        index += 1
        if index % 3 == 0 and index < len_array :   # print by 3
            p_string += print_lte_3(sh_array, escapenl, False)
            sh_array = []
        elif index >= len_array :   # last line, complete or incomplete
            p_string += print_lte_3(sh_array, escapenl, True)
    return p_string
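# A quick illustration of print_by_3 (made-up data, in the style of an
# interactive session):
# >>> print print_by_3(['a', 'b', 'c', 'd'])
# 'a b c 
# d'
# With escapenl = True the internal newlines are escaped with a backslash,
# so the quoted list can be pasted directly onto a shell command line.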
##### Get the list of installed packages ###########
# This is very easy since each package corresponds to a directory under
# pkgdir. However, if portupgrade is used, an extraneous pkgdb.db appears.
# The origin of each package is then obtained by parsing +CONTENTS.

def fill_inst_pkgs() :
    "Fills the dictionary inst_pkgs from the package database."
    ori_string = '@comment ORIGIN:'
    dep_string = '@pkgdep'
    deporigin_string = '@comment DEPORIGIN:'
    md5_string = '@comment MD5:'
    inst_pkgs_list = os.listdir(pkgdir)
    try :
        inst_pkgs_list.remove('pkgdb.db')
    except ValueError :
        pass
    print "There are now %d packages installed." % len(inst_pkgs_list)
    for pkg in inst_pkgs_list :
        all_deps = []
        one_dep = []
        for line in open(pkgdir + pkg + '/+CONTENTS') :
            # Lines for dependencies occur in pairs, the first being a
            # dependency and the second its origin.
            if line.startswith(ori_string) :         # Picks origin
                pkg_orig = line.replace(ori_string, '').strip()
                inst_pkgs[pkg_orig] = [pkg, '', '']
            if line.startswith(dep_string) :         # Picks dependency
                dep = line.replace(dep_string, '').strip()
                one_dep.append(dep)
            if line.startswith(deporigin_string) :   # Picks its origin
                dep = line.replace(deporigin_string, '').strip()
                one_dep.append(dep)        # Now has two elements
                all_deps.append(one_dep)   # Registers them
                one_dep = []               # Reset for next dependency
            if line.startswith(md5_string) :   # Picks first MD5 line
                break                          # To end of file.
        # If there is no MD5 line, we simply reach the end of the file.
        all_deps.sort()
        inst_pkgs[pkg_orig] = [pkg, all_deps, []]

# Now inst_pkgs contains all the run time dependencies of all installed
# packages, indexed by the origins recorded in the package database. We
# first need to update the origins using the MOVED file, which requires
# filling the moved and last_valid_mov dictionaries.

def fill_moved() :
    "Fills the 'moved' dictionary from the entries in 'MOVED'."
    mov_f = open(portdir + 'MOVED', 'r')
    lineno = 0
    mov1 = {}
    for line in mov_f :
        if line.find('#', 0, 1) == -1 :   # Discard comments at the beginning
            fields = line.split('|')
            try :
                mov1[fields[0]] = [fields[1], lineno]
                lineno += 1
            except IndexError :
                pass   # In case of empty lines. It occurred.
    # Recursively extend this to a new dictionary. We follow MOVED
    # recursively, but only with increasing lineno. In particular this
    # breaks loops, except when a name maps to itself. We want to transform
    # a name systematically into its most recent version, so if
    # a -> b -> a -> c we want a -> c, b -> c. Hence there is no need to
    # remember the a -> b transformation, and we can use a dictionary
    # storing m[a] = b, m[b] = a, m[a] = c without worrying that the second
    # assignment to m[a] overwrites the first.
    def follow_moved(orig) :
        "Follows an origin through MOVED."
        # The result is the most recent origin of a piece of software, if
        # it still exists, or '' if the port was marked 'Removed'.
        n_orig = orig
        o_lineno = -1
        while True :
            try :
                [m_orig, lineno] = mov1[n_orig]
                # Here lineno is the line where n_orig appears.
                if m_orig == '' :           # removed port
                    last_valid_mov[orig] = n_orig   # Fill last_valid_mov
                    return ''
                elif m_orig == n_orig :     # protects against a -> a lines
                    return n_orig
                else :                      # This is the recursive step
                    if lineno > o_lineno :  # Only move forward
                        n_orig = m_orig
                        o_lineno = lineno
                    else :                  # Break out of the recursion,
                        return n_orig       # normal exit.
            except KeyError :
                # Either orig does not appear in MOVED or n_orig has no
                # further move; normal exit here!
                return n_orig
    # Compute once and save in the "moved" dictionary. Note that
    # moved.keys() has all the names which appeared at least once in MOVED
    # as origins of a move, so it is in principle complete. Moreover, ports
    # which have been removed are caught in follow_moved() and added to
    # last_valid_mov.
    for orig in mov1.keys() :
        moved[orig] = follow_moved(orig)
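# An illustration of the resolution performed by follow_moved(), on a
# hypothetical MOVED excerpt (these entries are made up, not real ones):
#     editors/foo|editors/foo2|2006-01-01|renamed
#     editors/foo2|editors/foo3|2006-06-01|renamed again
#     games/bar||2006-03-01|removed
# After fill_moved() we would have
#     moved['editors/foo'] == 'editors/foo3'    # chain followed to the end
#     moved['games/bar'] == ''                  # removed port
#     last_valid_mov['games/bar'] == 'games/bar'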
# Check and update the origins of installed packages.

def fix_orig() :
    "Follows recursively the ports mentioned in MOVED."
    removed = []
    for orig in inst_pkgs.keys() :
        # Each origin must be tested for a possible move or deletion.
        if orig in moved :
            n_orig = moved[orig]
            if n_orig == '' :         # Removed port
                removed.append(orig)
            else :                    # Moved port
                if n_orig != orig :   # Protects against null moves.
                    inst_pkgs[n_orig] = inst_pkgs.pop(orig)  # Replace key
        else :
            n_orig = orig
        # This is because the next test is uniformly on n_orig. The MOVED
        # collection may be incomplete, so add a second check. In fact, if
        # an obsolete port remains, it will crash our index generation.
        path = portdir + n_orig + '/Makefile'
        if not os.access(path, os.F_OK) and not n_orig in removed :
            removed.append(n_orig)
            print >> checklog, "Port", n_orig, "doesn't really exist."
    for orig in removed :
        # Note that, up to now, these ports have kept their old origin, so
        # we update the origin using last_valid_mov.
        try :
            n_orig = last_valid_mov[orig]
            inst_pkgs[n_orig] = inst_pkgs.pop(orig)   # Replace key orig
        except KeyError :
            # orig is in removed but not in last_valid_mov. Should not
            # happen; in this case don't change the origin.
            n_orig = orig
    # Now we must update the origins of all the dependencies mentioned in
    # the dictionary inst_pkgs.
    for orig in inst_pkgs.keys() :
        for dep in inst_pkgs[orig][1] :   # For each dependency
            try :
                n_orig = moved[dep[1]]
            except KeyError :             # Origin has disappeared
                try :
                    n_orig = last_valid_mov[dep[1]]
                except KeyError :         # Origin did not change
                    n_orig = dep[1]
            dep[1] = n_orig               # Update origin

# Now inst_pkgs contains the installed packages indexed by their most recent
# or last valid origin, and for each one all its run dependencies. Moreover,
# all these dependencies are given with their updated origin. To fill
# "required_by" we simply simulate the installation of each package, that
# is, we add this package to the "required_by" list of every package
# mentioned among its dependencies.

def add_req_by_deps(orig) :
    "Adds the package of origin orig to the required_by field of its dependencies."
    for dep in inst_pkgs[orig][1] :   # Pick its dependencies
        d_orig = dep[1]               # d_orig is required by orig
        pkg = inst_pkgs[orig][0]      # current package name
        try :
            if pkg not in inst_pkgs[d_orig][2] :   # not yet in required_by
                inst_pkgs[d_orig][2].append(pkg)   # add pkg to required_by
        except KeyError :
            print "Package", pkg, "cannot be registered as required_by of", d_orig
            print >> checklog, "Package", pkg, "cannot be registered on", d_orig
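# A sketch of the data flow on hypothetical entries (not a real database),
# each value being [package name, dependencies, required_by]:
#     inst_pkgs['devel/foo'] == ['foo-1.0', [['bar-2.1', 'devel/bar']], []]
#     inst_pkgs['devel/bar'] == ['bar-2.1', [], []]
# After add_req_by_deps('devel/foo'), foo-1.0 is registered in the
# required_by field of its dependency:
#     inst_pkgs['devel/bar'] == ['bar-2.1', [], ['foo-1.0']]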
# Now inst_pkgs is completely filled. Based on it we determine the connected
# components among the installed packages. We use the disjoint-set structure
# of Cormen, Leiserson, Rivest and Stein, "Introduction to Algorithms",
# chapter 21 (union by rank with path compression, whose analysis is due to
# Tarjan). This requires keeping track, for each node, of its parent and its
# rank. Nodes are first installed as singletons, and sets are then merged as
# the links connecting them are discovered. To do that one goes up to the
# parent of each set and chooses as the common parent the one with the
# higher rank or, if the ranks are equal, chooses one of them and increases
# its rank. We take advantage of the search for the parent to flatten the
# parent relationship.

def conn_comp() :
    "Decomposes the set of packages into connected components."
    tarj = {}
    def make_set(x) :   # x is a node
        "Initially puts each node in a singleton."
        tarj[x] = [x, 0]   # initially x has parent x and rank 0
    def link(x, y) :    # to be used on the parents of two sets
        "Merges two sets when there is a link between them."
        # As a side effect this computes new values for the ranks, which
        # started at 0. The rank is basically the height of the tree rooted
        # at x.
        if tarj[x][1] > tarj[y][1] :   # if rank(x) > rank(y)
            tarj[y][0] = x             # x becomes parent of y
        else :
            tarj[x][0] = y             # y becomes parent of x
            if tarj[x][1] == tarj[y][1] :   # equal ranks?
                tarj[y][1] += 1             # increase rank of the parent
    def find_set(x) :
        "Finds the parent which determines the set to which x belongs."
        # find_set returns the parent of x. Recursively it returns the
        # parent of the parent, etc., so it gives the common parent of the
        # set. As a side effect it modifies the parent of x to be the
        # common parent of the set; this is "path compression" or
        # flattening. The recursion stops when we hit the top node.
        if x != tarj[x][0] :   # don't do that on the top node
            tarj[x][0] = find_set(tarj[x][0])   # recurse and flatten
        return tarj[x][0]
    # The algorithm consists in first applying make_set to each node and
    # then, for each arrow, applying the link function to the parents of
    # the sets to which the ends of the arrow belong. We apply this to our
    # case.
    for orig in inst_pkgs.keys() :
        make_set(orig)   # Fills the tarj dictionary
    # Now discover the arrows and gradually merge the sets.
    for orig in inst_pkgs.keys() :
        for d_orig in [dep[1] for dep in inst_pkgs[orig][1]] :
            try :
                link(find_set(orig), find_set(d_orig))
            except KeyError :
                print "Dependency", d_orig, "of", orig, "not installed."
    # All arrows orig -> d_orig have been discovered; the connected
    # components are determined by their parents.
    components = {}
    for orig in inst_pkgs.keys() :
        # Discover the parents of the components and fill components.
        try :               # parent already registered
            components[find_set(orig)].append(orig)   # Fill component
        except KeyError :   # first register the parent
            components[find_set(orig)] = [orig]       # Fill component
    # Finally display the result.
    singletons = []
    print >> checklog, "Connected components"
    print >> checklog, "\n"
    for p_orig in components.keys() :   # For each parent
        if len(components[p_orig]) > 1 :
            print >> checklog, print_by_3(
                [inst_pkgs[orig][0] for orig in components[p_orig]])
            print >> checklog, '\n'
        else :
            singletons.append(inst_pkgs[components[p_orig][0]][0])
    print >> checklog, "Singletons"
    print >> checklog, print_by_3(singletons)
    return components
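# A minimal trace of the disjoint-set machinery on three hypothetical nodes
# 'a', 'b', 'c' with the single arrow a -> b:
#     make_set('a'); make_set('b'); make_set('c')
#     link(find_set('a'), find_set('b'))
#     # now find_set('a') == find_set('b'), while find_set('c') == 'c'
# This would yield two components: {a, b} and the singleton {c}.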
def write_dotfile(component, fname) :
    "Writes a graphviz file for a connected component."
    fileh = open(fname, "w")
    print >> fileh, "digraph G {"
    for index in range(len(component)) :
        orig = component[index]
        print >> fileh, str(index) + \
            ' [label="' + inst_pkgs[orig][0] + '",color=red]'
    for index in range(len(component)) :
        orig = component[index]
        for d_orig in [dep[1] for dep in inst_pkgs[orig][1]] :   # all arrows
            try :
                print >> fileh, str(index) + ' -> ' + \
                    str(component.index(d_orig))
            except ValueError :
                pass
    print >> fileh, "}"
    fileh.close()

def clean_and_print(listports) :
    "Cleanup and printing."
    # We display, for each installed package, its most recent or last valid
    # origin according to the MOVED file, all the packages it requires, and
    # the packages which require it. This last information is based on the
    # +CONTENTS files, of course, not on a computation from the ports tree.
    print >> checklog, "\n"
    print >> checklog, "Installed packages."
    print >> checklog, "\n"
    for orig in listports :
        print >> checklog, orig, " == > ", inst_pkgs[orig][0]
        print >> checklog, "Requires:"
        print >> checklog, print_by_3([dep[0] for dep in inst_pkgs[orig][1]])
        inst_pkgs[orig][2].sort()
        print >> checklog, "Required by:"
        print >> checklog, print_by_3(inst_pkgs[orig][2])
        print >> checklog, "\n"
    print >> checklog, "\n"
    print >> checklog, "Leaf packages."
    print >> checklog, "\n"
    # Leaves are packages which are not required by any other package, so
    # two cases may occur. Either they were installed in the past as a
    # dependency of some other package and are no longer required: they can
    # be safely deleted. Or, on the contrary, they were installed
    # deliberately by the end user, who probably remembers why, and may thus
    # keep them.
    leafs = []
    for orig in listports :
        if len(inst_pkgs[orig][2]) == 0 :
            leafs.append(inst_pkgs[orig][0])
    leafs.sort()
    print >> checklog, print_by_3(leafs)
    print >> checklog, "\n"
    checklog.flush()

# Checking the +REQUIRED_BY files. For each installed package, check that it
# has a +REQUIRED_BY file if and only if it is not a leaf. In that case,
# check that the contents of the +REQUIRED_BY file are consistent with the
# required_by fields we have computed; the most common discrepancy should be
# differing version numbers.

def check_required(orig) :
    "Checks the existence of the +REQUIRED_BY file."
    pkg = inst_pkgs[orig][0]
    req_file = pkgdir + pkg + '/+REQUIRED_BY'
    # No requirements
    if len(inst_pkgs[orig][2]) == 0 :
        if os.access(req_file, os.F_OK) :
            print >> checklog, "Package", pkg, "has a +REQUIRED_BY file \
but is not required."
        return False
    # Usual case, has requirements
    if not os.access(req_file, os.F_OK) :
        print >> checklog, "Package", pkg, "doesn't have a +REQUIRED_BY file \
while it is required."
        print >> checklog, "required by", " ".join(inst_pkgs[orig][2])
        print >> checklog, "\n"
        return False
    return True

def check_req_file(orig) :
    "Checks the content of the +REQUIRED_BY file."
    # The difference between the +REQUIRED_BY file present on the machine
    # and the one we have computed should be limited to differences in
    # version numbers, so we strip the package names of their version
    # numbers before comparing.
    pkg = inst_pkgs[orig][0]
    req_file = pkgdir + pkg + '/+REQUIRED_BY'
    # Read the +REQUIRED_BY file.
    fhandle = open(req_file, 'r')
    contents = []
    strip_cont = []
    for line in fhandle :
        dep = line.strip()
        strip_dep = matchversion.sub('', dep)
        contents.append(dep)
        strip_cont.append(strip_dep)
    fhandle.close()
    contents.sort()
    strip_cont.sort()
    req_list = inst_pkgs[orig][2]
    strip_req_list = [matchversion.sub('', dep) for dep in req_list]
    # Compute the symmetric difference between the two stripped lists.
    sym_d = set(strip_req_list) ^ set(strip_cont)
    # If there is a difference between the stripped dependencies, there is
    # a real registration problem.
    if len(sym_d) :
        print >> checklog, "Package", pkg, "has a bogus +REQUIRED_BY file."
        print >> checklog, "Dependencies", " ".join(list(sym_d)), "are either \
bogus or not registered in the file."
        print >> checklog, "\n"
        return 'Bogus'
    # The dependencies match, but perhaps with incorrect version numbers.
    problems = []
    for dep in contents :
        if dep not in req_list :   # First compare versioned dependencies.
            problems.append(dep)
    for dep in problems :
        strip_dep = matchversion.sub('', dep)
        index = strip_req_list.index(strip_dep)   # We know this works
        ndep = req_list[index]
        print >> checklog, "Package", pkg, "has registered dependency", dep, "\
which should be updated to", ndep
    return 'Fine'
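# What the version-stripping regexp does, on a made-up package name:
# >>> matchversion.sub('', 'libiconv-1.9.2_2')
# 'libiconv'
# so two registrations of the same package which differ only in their
# version numbers compare equal after stripping.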
def fix_pkg_database() :
    "Updates the origins in +CONTENTS and overwrites the +REQUIRED_BY files."
    # This should be run only after the log in CheckLog has been reviewed.
    # First update the origin in the +CONTENTS file.
    ori_string = '@comment ORIGIN:'
    for orig in inst_pkgs.keys() :
        pkg = inst_pkgs[orig][0]
        # Fix the origin.
        cont_file = pkgdir + pkg + '/+CONTENTS'
        fho = open(cont_file, 'r')
        size = os.stat(cont_file)[6]   # size of the file
        fhod = fho.fileno()            # descriptor
        # Locate the ORIGIN string in the first 256 bytes. We need a
        # read-only mmap.
        trunc_size = min(size, 256)
        buff = mmap.mmap(fhod, trunc_size, mmap.MAP_PRIVATE, mmap.PROT_READ)
        pos1 = buff.find(ori_string)
        buff.seek(pos1)                # beginning of the origin line.
        orig_line = buff.readline()    # discard the origin line
        pos2 = buff.tell()             # beginning of the next line.
        buff.close()                   # No more need of the mmap.
        # Check whether orig needs to be changed.
        old_orig = orig_line.replace(ori_string, '').strip()
        if old_orig != orig :
            fhn = open(cont_file + '.new', 'w')
            fhnd = fhn.fileno()
            head = os.read(fhod, pos1)   # head of the contents file
            os.write(fhnd, head)         # writes the head
            os.write(fhnd, ori_string + orig + '\n')   # updates the origin
            os.lseek(fhod, pos2, 0)      # positions the file pointer
            tail = os.read(fhod, size - pos2)   # reads the tail
            os.write(fhnd, tail)         # writes the tail
            fhn.close()
            fho.close()
            # Move the new file in place
            os.rename(cont_file + '.new', cont_file)
        else :
            fho.close()
        req_file = pkgdir + pkg + '/+REQUIRED_BY'
        # Case of leaf packages: don't write a +REQUIRED_BY file, but
        # remove a bogus one.
        if not len(inst_pkgs[orig][2]) :
            if os.access(req_file, os.F_OK) :
                os.unlink(req_file)
            continue   # To the next package
        # General case: write a new +REQUIRED_BY file.
        fhn = open(req_file + '.new', 'w')
        for dep in inst_pkgs[orig][2] :
            print >> fhn, dep
        fhn.close()
        # Move the new file in place.
        os.rename(req_file + '.new', req_file)

if __name__ == '__main__' :
    # Prepare the necessary data structures.
    tstart = time.time()
    fill_inst_pkgs()   # Installed packages.
    fill_moved()       # Contents of the MOVED file.
    fix_orig()         # Use that to update the origins.
    # Compute and print the state of the package database.
    for orig in inst_pkgs.keys() :   # For each package,
        add_req_by_deps(orig)        # add it to required_by of its deps.
    listports = inst_pkgs.keys()
    listports.sort()                 # Will be sorted by categories
    clean_and_print(listports)
    if conn_comp_discover :
        components = conn_comp()     # Determine the connected components.
        if write_graphviz :
            index = 1
            for parent in components.keys() :
                component = components[parent]
                if len(component) > 1 :
                    fname = "graph" + str(index) + ".dot"
                    write_dotfile(component, fname)
                    index += 1
    # Finally do the checking and fixing work.
    for orig in listports :
        # Check the existing +REQUIRED_BY file.
        if check_required(orig) :
            check_req_file(orig)
    checklog.flush()
    if do_fix_packages :   # if required, fix the package database
        fix_pkg_database()
        print "Package database fixed."
    else :
        print "Analysis of package database terminated, see the log in CheckLog."
    print "Total time spent :", time.time() - tstart
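# Typical usage (the session below is illustrative, not captured output):
# run the script, read the CheckLog file it produces, and, if graphviz is
# installed, render the generated component graphs, e.g.
#     rose% ./check_pkg.py
#     rose% less CheckLog
#     rose% dot -Tpng graph1.dot -o graph1.png
# Only set do_fix_packages = True (and run as root) once CheckLog looks sane.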